{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1977,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007590853022108359,
      "grad_norm": 0.3484795391559601,
      "learning_rate": 2.0202020202020206e-06,
      "loss": 1.3363,
      "step": 5
    },
    {
      "epoch": 0.015181706044216719,
      "grad_norm": 0.22640460729599,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 1.2804,
      "step": 10
    },
    {
      "epoch": 0.02277255906632508,
      "grad_norm": 0.6324455142021179,
      "learning_rate": 7.0707070707070704e-06,
      "loss": 1.4853,
      "step": 15
    },
    {
      "epoch": 0.030363412088433437,
      "grad_norm": 0.22372472286224365,
      "learning_rate": 9.595959595959595e-06,
      "loss": 1.4012,
      "step": 20
    },
    {
      "epoch": 0.0379542651105418,
      "grad_norm": 0.36963725090026855,
      "learning_rate": 1.2121212121212122e-05,
      "loss": 1.3461,
      "step": 25
    },
    {
      "epoch": 0.04554511813265016,
      "grad_norm": 0.6136270761489868,
      "learning_rate": 1.4646464646464647e-05,
      "loss": 1.2617,
      "step": 30
    },
    {
      "epoch": 0.053135971154758516,
      "grad_norm": 0.20270676910877228,
      "learning_rate": 1.7171717171717173e-05,
      "loss": 1.1316,
      "step": 35
    },
    {
      "epoch": 0.060726824176866874,
      "grad_norm": 0.7699366807937622,
      "learning_rate": 1.9696969696969697e-05,
      "loss": 1.0233,
      "step": 40
    },
    {
      "epoch": 0.06831767719897523,
      "grad_norm": 0.1857442855834961,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 0.9781,
      "step": 45
    },
    {
      "epoch": 0.0759085302210836,
      "grad_norm": 0.18821729719638824,
      "learning_rate": 2.474747474747475e-05,
      "loss": 0.9322,
      "step": 50
    },
    {
      "epoch": 0.08349938324319195,
      "grad_norm": 0.2683129906654358,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 0.921,
      "step": 55
    },
    {
      "epoch": 0.09109023626530031,
      "grad_norm": 0.17556847631931305,
      "learning_rate": 2.9797979797979796e-05,
      "loss": 0.7989,
      "step": 60
    },
    {
      "epoch": 0.09868108928740867,
      "grad_norm": 0.13853396475315094,
      "learning_rate": 3.232323232323233e-05,
      "loss": 0.8641,
      "step": 65
    },
    {
      "epoch": 0.10627194230951703,
      "grad_norm": 0.20103883743286133,
      "learning_rate": 3.484848484848485e-05,
      "loss": 0.8642,
      "step": 70
    },
    {
      "epoch": 0.1138627953316254,
      "grad_norm": 0.3428489565849304,
      "learning_rate": 3.7373737373737376e-05,
      "loss": 0.9104,
      "step": 75
    },
    {
      "epoch": 0.12145364835373375,
      "grad_norm": 0.17233586311340332,
      "learning_rate": 3.98989898989899e-05,
      "loss": 0.7537,
      "step": 80
    },
    {
      "epoch": 0.1290445013758421,
      "grad_norm": 0.15092326700687408,
      "learning_rate": 4.242424242424243e-05,
      "loss": 0.6822,
      "step": 85
    },
    {
      "epoch": 0.13663535439795046,
      "grad_norm": 0.2307768166065216,
      "learning_rate": 4.494949494949495e-05,
      "loss": 0.6993,
      "step": 90
    },
    {
      "epoch": 0.14422620742005882,
      "grad_norm": 0.30504322052001953,
      "learning_rate": 4.7474747474747476e-05,
      "loss": 0.6094,
      "step": 95
    },
    {
      "epoch": 0.1518170604421672,
      "grad_norm": 0.4996664524078369,
      "learning_rate": 5e-05,
      "loss": 0.742,
      "step": 100
    },
    {
      "epoch": 0.15940791346427555,
      "grad_norm": 0.14200103282928467,
      "learning_rate": 5.2525252525252536e-05,
      "loss": 0.5918,
      "step": 105
    },
    {
      "epoch": 0.1669987664863839,
      "grad_norm": 0.15189217031002045,
      "learning_rate": 5.5050505050505056e-05,
      "loss": 0.5884,
      "step": 110
    },
    {
      "epoch": 0.17458961950849228,
      "grad_norm": 0.25142595171928406,
      "learning_rate": 5.757575757575758e-05,
      "loss": 0.4829,
      "step": 115
    },
    {
      "epoch": 0.18218047253060063,
      "grad_norm": 0.2258097231388092,
      "learning_rate": 6.01010101010101e-05,
      "loss": 0.6297,
      "step": 120
    },
    {
      "epoch": 0.18977132555270898,
      "grad_norm": 0.3805074095726013,
      "learning_rate": 6.262626262626264e-05,
      "loss": 0.5932,
      "step": 125
    },
    {
      "epoch": 0.19736217857481733,
      "grad_norm": 0.24002455174922943,
      "learning_rate": 6.515151515151516e-05,
      "loss": 0.6145,
      "step": 130
    },
    {
      "epoch": 0.2049530315969257,
      "grad_norm": 0.2202799916267395,
      "learning_rate": 6.767676767676769e-05,
      "loss": 0.5086,
      "step": 135
    },
    {
      "epoch": 0.21254388461903406,
      "grad_norm": 0.2224930077791214,
      "learning_rate": 7.020202020202021e-05,
      "loss": 0.4833,
      "step": 140
    },
    {
      "epoch": 0.22013473764114241,
      "grad_norm": 0.2609390914440155,
      "learning_rate": 7.272727272727273e-05,
      "loss": 0.5377,
      "step": 145
    },
    {
      "epoch": 0.2277255906632508,
      "grad_norm": 0.10272058844566345,
      "learning_rate": 7.525252525252525e-05,
      "loss": 0.438,
      "step": 150
    },
    {
      "epoch": 0.23531644368535914,
      "grad_norm": 0.289846807718277,
      "learning_rate": 7.777777777777778e-05,
      "loss": 0.5454,
      "step": 155
    },
    {
      "epoch": 0.2429072967074675,
      "grad_norm": 0.23176854848861694,
      "learning_rate": 8.03030303030303e-05,
      "loss": 0.5868,
      "step": 160
    },
    {
      "epoch": 0.25049814972957585,
      "grad_norm": 0.24970294535160065,
      "learning_rate": 8.282828282828283e-05,
      "loss": 0.5515,
      "step": 165
    },
    {
      "epoch": 0.2580890027516842,
      "grad_norm": 0.3183782398700714,
      "learning_rate": 8.535353535353535e-05,
      "loss": 0.4979,
      "step": 170
    },
    {
      "epoch": 0.2656798557737926,
      "grad_norm": 0.3254294693470001,
      "learning_rate": 8.787878787878789e-05,
      "loss": 0.4923,
      "step": 175
    },
    {
      "epoch": 0.27327070879590093,
      "grad_norm": 0.41256552934646606,
      "learning_rate": 9.040404040404041e-05,
      "loss": 0.4654,
      "step": 180
    },
    {
      "epoch": 0.2808615618180093,
      "grad_norm": 0.21282930672168732,
      "learning_rate": 9.292929292929293e-05,
      "loss": 0.4763,
      "step": 185
    },
    {
      "epoch": 0.28845241484011763,
      "grad_norm": 0.13954608142375946,
      "learning_rate": 9.545454545454546e-05,
      "loss": 0.4511,
      "step": 190
    },
    {
      "epoch": 0.296043267862226,
      "grad_norm": 0.33055710792541504,
      "learning_rate": 9.797979797979798e-05,
      "loss": 0.4938,
      "step": 195
    },
    {
      "epoch": 0.3036341208843344,
      "grad_norm": 0.2626458704471588,
      "learning_rate": 9.999992203714313e-05,
      "loss": 0.4624,
      "step": 200
    },
    {
      "epoch": 0.3112249739064427,
      "grad_norm": 0.2240830659866333,
      "learning_rate": 9.999719336268101e-05,
      "loss": 0.4803,
      "step": 205
    },
    {
      "epoch": 0.3188158269285511,
      "grad_norm": 0.20912809669971466,
      "learning_rate": 9.999056678850014e-05,
      "loss": 0.4875,
      "step": 210
    },
    {
      "epoch": 0.3264066799506595,
      "grad_norm": 0.3838413655757904,
      "learning_rate": 9.9980042831224e-05,
      "loss": 0.6046,
      "step": 215
    },
    {
      "epoch": 0.3339975329727678,
      "grad_norm": 0.2766261398792267,
      "learning_rate": 9.996562231132523e-05,
      "loss": 0.4424,
      "step": 220
    },
    {
      "epoch": 0.3415883859948762,
      "grad_norm": 0.24665701389312744,
      "learning_rate": 9.994730635306174e-05,
      "loss": 0.427,
      "step": 225
    },
    {
      "epoch": 0.34917923901698456,
      "grad_norm": 0.09618967026472092,
      "learning_rate": 9.992509638438907e-05,
      "loss": 0.4912,
      "step": 230
    },
    {
      "epoch": 0.3567700920390929,
      "grad_norm": 0.09648390114307404,
      "learning_rate": 9.9898994136849e-05,
      "loss": 0.5337,
      "step": 235
    },
    {
      "epoch": 0.36436094506120126,
      "grad_norm": 0.38818395137786865,
      "learning_rate": 9.986900164543467e-05,
      "loss": 0.5244,
      "step": 240
    },
    {
      "epoch": 0.37195179808330964,
      "grad_norm": 0.1550498604774475,
      "learning_rate": 9.983512124843177e-05,
      "loss": 0.4584,
      "step": 245
    },
    {
      "epoch": 0.37954265110541796,
      "grad_norm": 0.19514597952365875,
      "learning_rate": 9.97973555872364e-05,
      "loss": 0.4544,
      "step": 250
    },
    {
      "epoch": 0.38713350412752634,
      "grad_norm": 0.229110985994339,
      "learning_rate": 9.975570760614902e-05,
      "loss": 0.4177,
      "step": 255
    },
    {
      "epoch": 0.39472435714963466,
      "grad_norm": 0.157244473695755,
      "learning_rate": 9.971018055214496e-05,
      "loss": 0.435,
      "step": 260
    },
    {
      "epoch": 0.40231521017174304,
      "grad_norm": 0.1850701868534088,
      "learning_rate": 9.966077797462129e-05,
      "loss": 0.4151,
      "step": 265
    },
    {
      "epoch": 0.4099060631938514,
      "grad_norm": 0.29395997524261475,
      "learning_rate": 9.960750372512007e-05,
      "loss": 0.413,
      "step": 270
    },
    {
      "epoch": 0.41749691621595975,
      "grad_norm": 0.13114891946315765,
      "learning_rate": 9.955036195702806e-05,
      "loss": 0.4548,
      "step": 275
    },
    {
      "epoch": 0.4250877692380681,
      "grad_norm": 0.1735813319683075,
      "learning_rate": 9.948935712525299e-05,
      "loss": 0.4605,
      "step": 280
    },
    {
      "epoch": 0.4326786222601765,
      "grad_norm": 0.3185722529888153,
      "learning_rate": 9.942449398587616e-05,
      "loss": 0.556,
      "step": 285
    },
    {
      "epoch": 0.44026947528228483,
      "grad_norm": 0.09232311695814133,
      "learning_rate": 9.935577759578167e-05,
      "loss": 0.3765,
      "step": 290
    },
    {
      "epoch": 0.4478603283043932,
      "grad_norm": 0.1775842010974884,
      "learning_rate": 9.928321331226219e-05,
      "loss": 0.5862,
      "step": 295
    },
    {
      "epoch": 0.4554511813265016,
      "grad_norm": 0.17507719993591309,
      "learning_rate": 9.92068067926013e-05,
      "loss": 0.5954,
      "step": 300
    },
    {
      "epoch": 0.4630420343486099,
      "grad_norm": 0.1269141137599945,
      "learning_rate": 9.912656399363238e-05,
      "loss": 0.4592,
      "step": 305
    },
    {
      "epoch": 0.4706328873707183,
      "grad_norm": 0.2526402771472931,
      "learning_rate": 9.904249117127428e-05,
      "loss": 0.4719,
      "step": 310
    },
    {
      "epoch": 0.47822374039282667,
      "grad_norm": 0.09530569612979889,
      "learning_rate": 9.895459488004356e-05,
      "loss": 0.3575,
      "step": 315
    },
    {
      "epoch": 0.485814593414935,
      "grad_norm": 0.1187816858291626,
      "learning_rate": 9.886288197254341e-05,
      "loss": 0.4027,
      "step": 320
    },
    {
      "epoch": 0.49340544643704337,
      "grad_norm": 0.1907995641231537,
      "learning_rate": 9.876735959892953e-05,
      "loss": 0.3528,
      "step": 325
    },
    {
      "epoch": 0.5009962994591517,
      "grad_norm": 0.13331271708011627,
      "learning_rate": 9.866803520635262e-05,
      "loss": 0.4405,
      "step": 330
    },
    {
      "epoch": 0.5085871524812601,
      "grad_norm": 0.08372137695550919,
      "learning_rate": 9.856491653837776e-05,
      "loss": 0.4757,
      "step": 335
    },
    {
      "epoch": 0.5161780055033685,
      "grad_norm": 0.2553092837333679,
      "learning_rate": 9.84580116343808e-05,
      "loss": 0.3777,
      "step": 340
    },
    {
      "epoch": 0.5237688585254768,
      "grad_norm": 0.36431610584259033,
      "learning_rate": 9.834732882892146e-05,
      "loss": 0.4213,
      "step": 345
    },
    {
      "epoch": 0.5313597115475852,
      "grad_norm": 0.19020870327949524,
      "learning_rate": 9.823287675109365e-05,
      "loss": 0.485,
      "step": 350
    },
    {
      "epoch": 0.5389505645696935,
      "grad_norm": 0.2752246856689453,
      "learning_rate": 9.811466432385267e-05,
      "loss": 0.4534,
      "step": 355
    },
    {
      "epoch": 0.5465414175918019,
      "grad_norm": 0.13446789979934692,
      "learning_rate": 9.79927007633196e-05,
      "loss": 0.3736,
      "step": 360
    },
    {
      "epoch": 0.5541322706139102,
      "grad_norm": 0.3601730167865753,
      "learning_rate": 9.786699557806274e-05,
      "loss": 0.3864,
      "step": 365
    },
    {
      "epoch": 0.5617231236360186,
      "grad_norm": 0.28543752431869507,
      "learning_rate": 9.773755856835631e-05,
      "loss": 0.4452,
      "step": 370
    },
    {
      "epoch": 0.569313976658127,
      "grad_norm": 0.17038831114768982,
      "learning_rate": 9.760439982541646e-05,
      "loss": 0.4114,
      "step": 375
    },
    {
      "epoch": 0.5769048296802353,
      "grad_norm": 0.08299808204174042,
      "learning_rate": 9.746752973061446e-05,
      "loss": 0.4296,
      "step": 380
    },
    {
      "epoch": 0.5844956827023436,
      "grad_norm": 0.23236173391342163,
      "learning_rate": 9.732695895466735e-05,
      "loss": 0.4577,
      "step": 385
    },
    {
      "epoch": 0.592086535724452,
      "grad_norm": 0.15669377148151398,
      "learning_rate": 9.718269845680606e-05,
      "loss": 0.4083,
      "step": 390
    },
    {
      "epoch": 0.5996773887465604,
      "grad_norm": 0.13645882904529572,
      "learning_rate": 9.7034759483921e-05,
      "loss": 0.3649,
      "step": 395
    },
    {
      "epoch": 0.6072682417686688,
      "grad_norm": 0.05155817046761513,
      "learning_rate": 9.688315356968521e-05,
      "loss": 0.3618,
      "step": 400
    },
    {
      "epoch": 0.6148590947907772,
      "grad_norm": 0.11633949726819992,
      "learning_rate": 9.672789253365515e-05,
      "loss": 0.4005,
      "step": 405
    },
    {
      "epoch": 0.6224499478128854,
      "grad_norm": 0.18201440572738647,
      "learning_rate": 9.656898848034926e-05,
      "loss": 0.4173,
      "step": 410
    },
    {
      "epoch": 0.6300408008349938,
      "grad_norm": 0.16578234732151031,
      "learning_rate": 9.640645379830424e-05,
      "loss": 0.3481,
      "step": 415
    },
    {
      "epoch": 0.6376316538571022,
      "grad_norm": 0.2896673083305359,
      "learning_rate": 9.624030115910919e-05,
      "loss": 0.3855,
      "step": 420
    },
    {
      "epoch": 0.6452225068792106,
      "grad_norm": 0.24677187204360962,
      "learning_rate": 9.607054351641778e-05,
      "loss": 0.4146,
      "step": 425
    },
    {
      "epoch": 0.652813359901319,
      "grad_norm": 0.0908162072300911,
      "learning_rate": 9.589719410493822e-05,
      "loss": 0.4494,
      "step": 430
    },
    {
      "epoch": 0.6604042129234273,
      "grad_norm": 0.07764331251382828,
      "learning_rate": 9.572026643940161e-05,
      "loss": 0.3942,
      "step": 435
    },
    {
      "epoch": 0.6679950659455356,
      "grad_norm": 0.20974747836589813,
      "learning_rate": 9.553977431350816e-05,
      "loss": 0.385,
      "step": 440
    },
    {
      "epoch": 0.675585918967644,
      "grad_norm": 0.17720717191696167,
      "learning_rate": 9.535573179885191e-05,
      "loss": 0.368,
      "step": 445
    },
    {
      "epoch": 0.6831767719897524,
      "grad_norm": 0.24612246453762054,
      "learning_rate": 9.516815324382356e-05,
      "loss": 0.3519,
      "step": 450
    },
    {
      "epoch": 0.6907676250118607,
      "grad_norm": 0.2337830364704132,
      "learning_rate": 9.497705327249198e-05,
      "loss": 0.4045,
      "step": 455
    },
    {
      "epoch": 0.6983584780339691,
      "grad_norm": 0.20068958401679993,
      "learning_rate": 9.47824467834639e-05,
      "loss": 0.3792,
      "step": 460
    },
    {
      "epoch": 0.7059493310560774,
      "grad_norm": 0.2409381866455078,
      "learning_rate": 9.45843489487226e-05,
      "loss": 0.3082,
      "step": 465
    },
    {
      "epoch": 0.7135401840781858,
      "grad_norm": 0.36345911026000977,
      "learning_rate": 9.438277521244489e-05,
      "loss": 0.3868,
      "step": 470
    },
    {
      "epoch": 0.7211310371002941,
      "grad_norm": 0.24875889718532562,
      "learning_rate": 9.417774128979706e-05,
      "loss": 0.3718,
      "step": 475
    },
    {
      "epoch": 0.7287218901224025,
      "grad_norm": 0.11670625954866409,
      "learning_rate": 9.39692631657098e-05,
      "loss": 0.4664,
      "step": 480
    },
    {
      "epoch": 0.7363127431445109,
      "grad_norm": 0.1045941710472107,
      "learning_rate": 9.375735709363189e-05,
      "loss": 0.4966,
      "step": 485
    },
    {
      "epoch": 0.7439035961666193,
      "grad_norm": 0.22171644866466522,
      "learning_rate": 9.354203959426304e-05,
      "loss": 0.3507,
      "step": 490
    },
    {
      "epoch": 0.7514944491887275,
      "grad_norm": 0.20189598202705383,
      "learning_rate": 9.332332745426596e-05,
      "loss": 0.5197,
      "step": 495
    },
    {
      "epoch": 0.7590853022108359,
      "grad_norm": 0.1306353658437729,
      "learning_rate": 9.310123772495756e-05,
      "loss": 0.4887,
      "step": 500
    },
    {
      "epoch": 0.7590853022108359,
      "eval_loss": 0.6389759182929993,
      "eval_runtime": 241.6392,
      "eval_samples_per_second": 9.696,
      "eval_steps_per_second": 9.696,
      "step": 500
    },
    {
      "epoch": 0.7666761552329443,
      "grad_norm": 0.1597890555858612,
      "learning_rate": 9.28757877209796e-05,
      "loss": 0.4177,
      "step": 505
    },
    {
      "epoch": 0.7742670082550527,
      "grad_norm": 0.07365977764129639,
      "learning_rate": 9.264699501894887e-05,
      "loss": 0.3627,
      "step": 510
    },
    {
      "epoch": 0.7818578612771611,
      "grad_norm": 0.18954086303710938,
      "learning_rate": 9.241487745608681e-05,
      "loss": 0.3507,
      "step": 515
    },
    {
      "epoch": 0.7894487142992693,
      "grad_norm": 0.0904935896396637,
      "learning_rate": 9.217945312882888e-05,
      "loss": 0.4617,
      "step": 520
    },
    {
      "epoch": 0.7970395673213777,
      "grad_norm": 0.0969192162156105,
      "learning_rate": 9.19407403914137e-05,
      "loss": 0.4213,
      "step": 525
    },
    {
      "epoch": 0.8046304203434861,
      "grad_norm": 0.28966036438941956,
      "learning_rate": 9.16987578544522e-05,
      "loss": 0.3588,
      "step": 530
    },
    {
      "epoch": 0.8122212733655945,
      "grad_norm": 0.305105984210968,
      "learning_rate": 9.145352438347662e-05,
      "loss": 0.3805,
      "step": 535
    },
    {
      "epoch": 0.8198121263877028,
      "grad_norm": 0.09918319433927536,
      "learning_rate": 9.12050590974697e-05,
      "loss": 0.4213,
      "step": 540
    },
    {
      "epoch": 0.8274029794098112,
      "grad_norm": 0.2274104654788971,
      "learning_rate": 9.095338136737413e-05,
      "loss": 0.3999,
      "step": 545
    },
    {
      "epoch": 0.8349938324319195,
      "grad_norm": 0.4053689241409302,
      "learning_rate": 9.069851081458242e-05,
      "loss": 0.3478,
      "step": 550
    },
    {
      "epoch": 0.8425846854540279,
      "grad_norm": 0.1169477105140686,
      "learning_rate": 9.044046730940705e-05,
      "loss": 0.3894,
      "step": 555
    },
    {
      "epoch": 0.8501755384761362,
      "grad_norm": 0.13794220983982086,
      "learning_rate": 9.017927096953139e-05,
      "loss": 0.4559,
      "step": 560
    },
    {
      "epoch": 0.8577663914982446,
      "grad_norm": 0.059803109616041183,
      "learning_rate": 8.991494215844132e-05,
      "loss": 0.4398,
      "step": 565
    },
    {
      "epoch": 0.865357244520353,
      "grad_norm": 0.1535421907901764,
      "learning_rate": 8.964750148383756e-05,
      "loss": 0.396,
      "step": 570
    },
    {
      "epoch": 0.8729480975424614,
      "grad_norm": 0.07398340851068497,
      "learning_rate": 8.937696979602909e-05,
      "loss": 0.3721,
      "step": 575
    },
    {
      "epoch": 0.8805389505645697,
      "grad_norm": 0.19821172952651978,
      "learning_rate": 8.910336818630765e-05,
      "loss": 0.3879,
      "step": 580
    },
    {
      "epoch": 0.888129803586678,
      "grad_norm": 0.16163092851638794,
      "learning_rate": 8.88267179853033e-05,
      "loss": 0.4246,
      "step": 585
    },
    {
      "epoch": 0.8957206566087864,
      "grad_norm": 0.23528441786766052,
      "learning_rate": 8.854704076132157e-05,
      "loss": 0.3265,
      "step": 590
    },
    {
      "epoch": 0.9033115096308948,
      "grad_norm": 0.06464134156703949,
      "learning_rate": 8.826435831866184e-05,
      "loss": 0.4734,
      "step": 595
    },
    {
      "epoch": 0.9109023626530032,
      "grad_norm": 0.13423138856887817,
      "learning_rate": 8.797869269591748e-05,
      "loss": 0.3093,
      "step": 600
    },
    {
      "epoch": 0.9184932156751114,
      "grad_norm": 0.18938276171684265,
      "learning_rate": 8.769006616425761e-05,
      "loss": 0.445,
      "step": 605
    },
    {
      "epoch": 0.9260840686972198,
      "grad_norm": 0.1079874336719513,
      "learning_rate": 8.739850122569091e-05,
      "loss": 0.3719,
      "step": 610
    },
    {
      "epoch": 0.9336749217193282,
      "grad_norm": 0.18556272983551025,
      "learning_rate": 8.710402061131113e-05,
      "loss": 0.3832,
      "step": 615
    },
    {
      "epoch": 0.9412657747414366,
      "grad_norm": 0.14005886018276215,
      "learning_rate": 8.680664727952513e-05,
      "loss": 0.4454,
      "step": 620
    },
    {
      "epoch": 0.948856627763545,
      "grad_norm": 0.19735439121723175,
      "learning_rate": 8.650640441426274e-05,
      "loss": 0.3663,
      "step": 625
    },
    {
      "epoch": 0.9564474807856533,
      "grad_norm": 0.46979591250419617,
      "learning_rate": 8.620331542316955e-05,
      "loss": 0.358,
      "step": 630
    },
    {
      "epoch": 0.9640383338077616,
      "grad_norm": 0.11601805686950684,
      "learning_rate": 8.589740393578179e-05,
      "loss": 0.343,
      "step": 635
    },
    {
      "epoch": 0.97162918682987,
      "grad_norm": 0.1647682785987854,
      "learning_rate": 8.558869380168423e-05,
      "loss": 0.3644,
      "step": 640
    },
    {
      "epoch": 0.9792200398519784,
      "grad_norm": 0.15652813017368317,
      "learning_rate": 8.527720908865075e-05,
      "loss": 0.3628,
      "step": 645
    },
    {
      "epoch": 0.9868108928740867,
      "grad_norm": 0.09970453381538391,
      "learning_rate": 8.496297408076798e-05,
      "loss": 0.3634,
      "step": 650
    },
    {
      "epoch": 0.9944017458961951,
      "grad_norm": 0.19404280185699463,
      "learning_rate": 8.464601327654207e-05,
      "loss": 0.3638,
      "step": 655
    },
    {
      "epoch": 1.0015181706044216,
      "grad_norm": 0.07966156303882599,
      "learning_rate": 8.432635138698873e-05,
      "loss": 0.3526,
      "step": 660
    },
    {
      "epoch": 1.00910902362653,
      "grad_norm": 0.16148845851421356,
      "learning_rate": 8.400401333370662e-05,
      "loss": 0.449,
      "step": 665
    },
    {
      "epoch": 1.0166998766486384,
      "grad_norm": 0.16588972508907318,
      "learning_rate": 8.367902424693456e-05,
      "loss": 0.2698,
      "step": 670
    },
    {
      "epoch": 1.0242907296707469,
      "grad_norm": 0.15112628042697906,
      "learning_rate": 8.335140946359216e-05,
      "loss": 0.526,
      "step": 675
    },
    {
      "epoch": 1.0318815826928551,
      "grad_norm": 0.544543981552124,
      "learning_rate": 8.302119452530459e-05,
      "loss": 0.3187,
      "step": 680
    },
    {
      "epoch": 1.0394724357149634,
      "grad_norm": 0.09538525342941284,
      "learning_rate": 8.268840517641123e-05,
      "loss": 0.3222,
      "step": 685
    },
    {
      "epoch": 1.0470632887370719,
      "grad_norm": 0.21151971817016602,
      "learning_rate": 8.235306736195865e-05,
      "loss": 0.4085,
      "step": 690
    },
    {
      "epoch": 1.0546541417591802,
      "grad_norm": 0.07254460453987122,
      "learning_rate": 8.201520722567783e-05,
      "loss": 0.3323,
      "step": 695
    },
    {
      "epoch": 1.0622449947812886,
      "grad_norm": 0.2756185829639435,
      "learning_rate": 8.16748511079459e-05,
      "loss": 0.2916,
      "step": 700
    },
    {
      "epoch": 1.069835847803397,
      "grad_norm": 0.07430507987737656,
      "learning_rate": 8.13320255437327e-05,
      "loss": 0.3352,
      "step": 705
    },
    {
      "epoch": 1.0774267008255052,
      "grad_norm": 0.07954972982406616,
      "learning_rate": 8.098675726053187e-05,
      "loss": 0.2949,
      "step": 710
    },
    {
      "epoch": 1.0850175538476137,
      "grad_norm": 0.10341721773147583,
      "learning_rate": 8.06390731762773e-05,
      "loss": 0.3474,
      "step": 715
    },
    {
      "epoch": 1.092608406869722,
      "grad_norm": 0.08508666604757309,
      "learning_rate": 8.028900039724443e-05,
      "loss": 0.3108,
      "step": 720
    },
    {
      "epoch": 1.1001992598918304,
      "grad_norm": 0.14489281177520752,
      "learning_rate": 7.993656621593699e-05,
      "loss": 0.3777,
      "step": 725
    },
    {
      "epoch": 1.1077901129139387,
      "grad_norm": 0.07771521061658859,
      "learning_rate": 7.958179810895922e-05,
      "loss": 0.2953,
      "step": 730
    },
    {
      "epoch": 1.115380965936047,
      "grad_norm": 0.19902044534683228,
      "learning_rate": 7.92247237348738e-05,
      "loss": 0.3823,
      "step": 735
    },
    {
      "epoch": 1.1229718189581555,
      "grad_norm": 0.07754581421613693,
      "learning_rate": 7.886537093204539e-05,
      "loss": 0.4141,
      "step": 740
    },
    {
      "epoch": 1.1305626719802637,
      "grad_norm": 0.08018027245998383,
      "learning_rate": 7.850376771647038e-05,
      "loss": 0.3957,
      "step": 745
    },
    {
      "epoch": 1.1381535250023722,
      "grad_norm": 0.09160657972097397,
      "learning_rate": 7.813994227959274e-05,
      "loss": 0.2976,
      "step": 750
    },
    {
      "epoch": 1.1457443780244805,
      "grad_norm": 0.11231184750795364,
      "learning_rate": 7.777392298610594e-05,
      "loss": 0.3355,
      "step": 755
    },
    {
      "epoch": 1.1533352310465887,
      "grad_norm": 0.14084723591804504,
      "learning_rate": 7.740573837174184e-05,
      "loss": 0.3374,
      "step": 760
    },
    {
      "epoch": 1.1609260840686972,
      "grad_norm": 0.083552785217762,
      "learning_rate": 7.703541714104577e-05,
      "loss": 0.424,
      "step": 765
    },
    {
      "epoch": 1.1685169370908055,
      "grad_norm": 0.07307687401771545,
      "learning_rate": 7.666298816513879e-05,
      "loss": 0.414,
      "step": 770
    },
    {
      "epoch": 1.176107790112914,
      "grad_norm": 0.17594853043556213,
      "learning_rate": 7.628848047946675e-05,
      "loss": 0.3479,
      "step": 775
    },
    {
      "epoch": 1.1836986431350223,
      "grad_norm": 0.13654448091983795,
      "learning_rate": 7.59119232815366e-05,
      "loss": 0.3557,
      "step": 780
    },
    {
      "epoch": 1.1912894961571308,
      "grad_norm": 0.13807976245880127,
      "learning_rate": 7.553334592864021e-05,
      "loss": 0.3345,
      "step": 785
    },
    {
      "epoch": 1.198880349179239,
      "grad_norm": 0.6585047841072083,
      "learning_rate": 7.515277793556545e-05,
      "loss": 0.3335,
      "step": 790
    },
    {
      "epoch": 1.2064712022013473,
      "grad_norm": 0.20239250361919403,
      "learning_rate": 7.477024897229526e-05,
      "loss": 0.3291,
      "step": 795
    },
    {
      "epoch": 1.2140620552234558,
      "grad_norm": 0.13821843266487122,
      "learning_rate": 7.43857888616944e-05,
      "loss": 0.2888,
      "step": 800
    },
    {
      "epoch": 1.221652908245564,
      "grad_norm": 0.2950344383716583,
      "learning_rate": 7.399942757718455e-05,
      "loss": 0.3215,
      "step": 805
    },
    {
      "epoch": 1.2292437612676725,
      "grad_norm": 0.19034092128276825,
      "learning_rate": 7.361119524040733e-05,
      "loss": 0.3507,
      "step": 810
    },
    {
      "epoch": 1.2368346142897808,
      "grad_norm": 0.11198897659778595,
      "learning_rate": 7.322112211887612e-05,
      "loss": 0.4081,
      "step": 815
    },
    {
      "epoch": 1.244425467311889,
      "grad_norm": 0.08805972337722778,
      "learning_rate": 7.282923862361612e-05,
      "loss": 0.3463,
      "step": 820
    },
    {
      "epoch": 1.2520163203339976,
      "grad_norm": 0.1584472358226776,
      "learning_rate": 7.243557530679367e-05,
      "loss": 0.3533,
      "step": 825
    },
    {
      "epoch": 1.2596071733561058,
      "grad_norm": 0.1389380395412445,
      "learning_rate": 7.204016285933415e-05,
      "loss": 0.3155,
      "step": 830
    },
    {
      "epoch": 1.2671980263782143,
      "grad_norm": 0.21182285249233246,
      "learning_rate": 7.164303210852934e-05,
      "loss": 0.3131,
      "step": 835
    },
    {
      "epoch": 1.2747888794003226,
      "grad_norm": 0.13253886997699738,
      "learning_rate": 7.124421401563403e-05,
      "loss": 0.3223,
      "step": 840
    },
    {
      "epoch": 1.2823797324224309,
      "grad_norm": 0.07222673296928406,
      "learning_rate": 7.084373967345217e-05,
      "loss": 0.3448,
      "step": 845
    },
    {
      "epoch": 1.2899705854445394,
      "grad_norm": 0.137168288230896,
      "learning_rate": 7.044164030391286e-05,
      "loss": 0.3283,
      "step": 850
    },
    {
      "epoch": 1.2975614384666476,
      "grad_norm": 0.11420410126447678,
      "learning_rate": 7.003794725563617e-05,
      "loss": 0.2769,
      "step": 855
    },
    {
      "epoch": 1.3051522914887561,
      "grad_norm": 0.13328640162944794,
      "learning_rate": 6.963269200148915e-05,
      "loss": 0.3813,
      "step": 860
    },
    {
      "epoch": 1.3127431445108644,
      "grad_norm": 0.2379562109708786,
      "learning_rate": 6.922590613613211e-05,
      "loss": 0.3248,
      "step": 865
    },
    {
      "epoch": 1.3203339975329729,
      "grad_norm": 0.10297714918851852,
      "learning_rate": 6.881762137355545e-05,
      "loss": 0.3201,
      "step": 870
    },
    {
      "epoch": 1.3279248505550811,
      "grad_norm": 0.10009097307920456,
      "learning_rate": 6.840786954460713e-05,
      "loss": 0.3096,
      "step": 875
    },
    {
      "epoch": 1.3355157035771894,
      "grad_norm": 0.09942924976348877,
      "learning_rate": 6.799668259451114e-05,
      "loss": 0.3267,
      "step": 880
    },
    {
      "epoch": 1.343106556599298,
      "grad_norm": 0.13200506567955017,
      "learning_rate": 6.758409258037683e-05,
      "loss": 0.3253,
      "step": 885
    },
    {
      "epoch": 1.3506974096214062,
      "grad_norm": 0.10787712782621384,
      "learning_rate": 6.717013166869982e-05,
      "loss": 0.3455,
      "step": 890
    },
    {
      "epoch": 1.3582882626435144,
      "grad_norm": 0.14178533852100372,
      "learning_rate": 6.675483213285412e-05,
      "loss": 0.4479,
      "step": 895
    },
    {
      "epoch": 1.365879115665623,
      "grad_norm": 0.10185932368040085,
      "learning_rate": 6.633822635057609e-05,
      "loss": 0.3427,
      "step": 900
    },
    {
      "epoch": 1.3734699686877314,
      "grad_norm": 0.07396234571933746,
      "learning_rate": 6.592034680144008e-05,
      "loss": 0.3582,
      "step": 905
    },
    {
      "epoch": 1.3810608217098397,
      "grad_norm": 0.2012874335050583,
      "learning_rate": 6.550122606432639e-05,
      "loss": 0.3648,
      "step": 910
    },
    {
      "epoch": 1.388651674731948,
      "grad_norm": 0.09361989051103592,
      "learning_rate": 6.508089681488126e-05,
      "loss": 0.3302,
      "step": 915
    },
    {
      "epoch": 1.3962425277540564,
      "grad_norm": 0.1503518670797348,
      "learning_rate": 6.465939182296943e-05,
      "loss": 0.3528,
      "step": 920
    },
    {
      "epoch": 1.4038333807761647,
      "grad_norm": 0.06482362002134323,
      "learning_rate": 6.42367439501193e-05,
      "loss": 0.3154,
      "step": 925
    },
    {
      "epoch": 1.411424233798273,
      "grad_norm": 0.17551842331886292,
      "learning_rate": 6.381298614696094e-05,
      "loss": 0.2889,
      "step": 930
    },
    {
      "epoch": 1.4190150868203815,
      "grad_norm": 0.09612052887678146,
      "learning_rate": 6.338815145065727e-05,
      "loss": 0.2348,
      "step": 935
    },
    {
      "epoch": 1.4266059398424897,
      "grad_norm": 0.20216520130634308,
      "learning_rate": 6.296227298232834e-05,
      "loss": 0.3356,
      "step": 940
    },
    {
      "epoch": 1.4341967928645982,
      "grad_norm": 0.11559966951608658,
      "learning_rate": 6.253538394446914e-05,
      "loss": 0.3254,
      "step": 945
    },
    {
      "epoch": 1.4417876458867065,
      "grad_norm": 0.1131603941321373,
      "learning_rate": 6.210751761836105e-05,
      "loss": 0.2883,
      "step": 950
    },
    {
      "epoch": 1.449378498908815,
      "grad_norm": 0.3986540734767914,
      "learning_rate": 6.167870736147713e-05,
      "loss": 0.3648,
      "step": 955
    },
    {
      "epoch": 1.4569693519309233,
      "grad_norm": 0.3160419464111328,
      "learning_rate": 6.124898660488158e-05,
      "loss": 0.3506,
      "step": 960
    },
    {
      "epoch": 1.4645602049530315,
      "grad_norm": 0.09258974343538284,
      "learning_rate": 6.081838885062328e-05,
      "loss": 0.3197,
      "step": 965
    },
    {
      "epoch": 1.47215105797514,
      "grad_norm": 0.14045535027980804,
      "learning_rate": 6.038694766912394e-05,
      "loss": 0.3634,
      "step": 970
    },
    {
      "epoch": 1.4797419109972483,
      "grad_norm": 0.1499331146478653,
      "learning_rate": 5.9954696696560844e-05,
      "loss": 0.2952,
      "step": 975
    },
    {
      "epoch": 1.4873327640193565,
      "grad_norm": 0.12873616814613342,
      "learning_rate": 5.952166963224451e-05,
      "loss": 0.3923,
      "step": 980
    },
    {
      "epoch": 1.494923617041465,
      "grad_norm": 0.05709124356508255,
      "learning_rate": 5.908790023599144e-05,
      "loss": 0.3321,
      "step": 985
    },
    {
      "epoch": 1.5025144700635735,
      "grad_norm": 0.07169659435749054,
      "learning_rate": 5.865342232549204e-05,
      "loss": 0.3493,
      "step": 990
    },
    {
      "epoch": 1.5101053230856818,
      "grad_norm": 0.11258374899625778,
      "learning_rate": 5.8218269773674195e-05,
      "loss": 0.3039,
      "step": 995
    },
    {
      "epoch": 1.51769617610779,
      "grad_norm": 0.0858062282204628,
      "learning_rate": 5.778247650606242e-05,
      "loss": 0.2878,
      "step": 1000
    },
    {
      "epoch": 1.51769617610779,
      "eval_loss": 0.5372660756111145,
      "eval_runtime": 246.0786,
      "eval_samples_per_second": 9.521,
      "eval_steps_per_second": 9.521,
      "step": 1000
    },
    {
      "epoch": 1.5252870291298986,
      "grad_norm": 0.09433668851852417,
      "learning_rate": 5.734607649813297e-05,
      "loss": 0.2803,
      "step": 1005
    },
    {
      "epoch": 1.5328778821520068,
      "grad_norm": 0.18208527565002441,
      "learning_rate": 5.6909103772665015e-05,
      "loss": 0.3029,
      "step": 1010
    },
    {
      "epoch": 1.540468735174115,
      "grad_norm": 0.1644572615623474,
      "learning_rate": 5.647159239708809e-05,
      "loss": 0.2799,
      "step": 1015
    },
    {
      "epoch": 1.5480595881962236,
      "grad_norm": 0.13228443264961243,
      "learning_rate": 5.603357648082622e-05,
      "loss": 0.3209,
      "step": 1020
    },
    {
      "epoch": 1.555650441218332,
      "grad_norm": 0.08718783408403397,
      "learning_rate": 5.559509017263862e-05,
      "loss": 0.3287,
      "step": 1025
    },
    {
      "epoch": 1.5632412942404401,
      "grad_norm": 0.20194876194000244,
      "learning_rate": 5.515616765795736e-05,
      "loss": 0.3244,
      "step": 1030
    },
    {
      "epoch": 1.5708321472625486,
      "grad_norm": 0.319332093000412,
      "learning_rate": 5.471684315622218e-05,
      "loss": 0.2786,
      "step": 1035
    },
    {
      "epoch": 1.578423000284657,
      "grad_norm": 0.07592841982841492,
      "learning_rate": 5.42771509182127e-05,
      "loss": 0.404,
      "step": 1040
    },
    {
      "epoch": 1.5860138533067654,
      "grad_norm": 0.07565618306398392,
      "learning_rate": 5.383712522337817e-05,
      "loss": 0.3701,
      "step": 1045
    },
    {
      "epoch": 1.5936047063288736,
      "grad_norm": 0.14734983444213867,
      "learning_rate": 5.339680037716487e-05,
      "loss": 0.2615,
      "step": 1050
    },
    {
      "epoch": 1.6011955593509821,
      "grad_norm": 0.12061991542577744,
      "learning_rate": 5.2956210708341657e-05,
      "loss": 0.4049,
      "step": 1055
    },
    {
      "epoch": 1.6087864123730904,
      "grad_norm": 0.10585551708936691,
      "learning_rate": 5.2515390566323574e-05,
      "loss": 0.2495,
      "step": 1060
    },
    {
      "epoch": 1.6163772653951987,
      "grad_norm": 0.19110319018363953,
      "learning_rate": 5.2074374318493915e-05,
      "loss": 0.2675,
      "step": 1065
    },
    {
      "epoch": 1.6239681184173071,
      "grad_norm": 0.09903518110513687,
      "learning_rate": 5.163319634752484e-05,
      "loss": 0.3348,
      "step": 1070
    },
    {
      "epoch": 1.6315589714394156,
      "grad_norm": 0.21369488537311554,
      "learning_rate": 5.119189104869683e-05,
      "loss": 0.3546,
      "step": 1075
    },
    {
      "epoch": 1.639149824461524,
      "grad_norm": 0.051785871386528015,
      "learning_rate": 5.075049282721715e-05,
      "loss": 0.2549,
      "step": 1080
    },
    {
      "epoch": 1.6467406774836322,
      "grad_norm": 0.08246368169784546,
      "learning_rate": 5.030903609553753e-05,
      "loss": 0.4519,
      "step": 1085
    },
    {
      "epoch": 1.6543315305057407,
      "grad_norm": 0.16289927065372467,
      "learning_rate": 4.9867555270671296e-05,
      "loss": 0.323,
      "step": 1090
    },
    {
      "epoch": 1.661922383527849,
      "grad_norm": 0.09280192106962204,
      "learning_rate": 4.942608477151013e-05,
      "loss": 0.3735,
      "step": 1095
    },
    {
      "epoch": 1.6695132365499572,
      "grad_norm": 0.16873304545879364,
      "learning_rate": 4.898465901614072e-05,
      "loss": 0.3447,
      "step": 1100
    },
    {
      "epoch": 1.6771040895720657,
      "grad_norm": 0.12987276911735535,
      "learning_rate": 4.8543312419161396e-05,
      "loss": 0.3249,
      "step": 1105
    },
    {
      "epoch": 1.6846949425941742,
      "grad_norm": 0.08189664781093597,
      "learning_rate": 4.8102079388999106e-05,
      "loss": 0.3743,
      "step": 1110
    },
    {
      "epoch": 1.6922857956162822,
      "grad_norm": 0.14963483810424805,
      "learning_rate": 4.7660994325226906e-05,
      "loss": 0.2677,
      "step": 1115
    },
    {
      "epoch": 1.6998766486383907,
      "grad_norm": 0.08255660533905029,
      "learning_rate": 4.722009161588199e-05,
      "loss": 0.2741,
      "step": 1120
    },
    {
      "epoch": 1.7074675016604992,
      "grad_norm": 0.14731058478355408,
      "learning_rate": 4.67794056347848e-05,
      "loss": 0.3997,
      "step": 1125
    },
    {
      "epoch": 1.7150583546826075,
      "grad_norm": 0.0929296612739563,
      "learning_rate": 4.63389707388591e-05,
      "loss": 0.3421,
      "step": 1130
    },
    {
      "epoch": 1.7226492077047157,
      "grad_norm": 0.1865396797657013,
      "learning_rate": 4.589882126545352e-05,
      "loss": 0.3422,
      "step": 1135
    },
    {
      "epoch": 1.7302400607268242,
      "grad_norm": 0.22962845861911774,
      "learning_rate": 4.545899152966439e-05,
      "loss": 0.3236,
      "step": 1140
    },
    {
      "epoch": 1.7378309137489325,
      "grad_norm": 0.07792218029499054,
      "learning_rate": 4.501951582166061e-05,
      "loss": 0.3311,
      "step": 1145
    },
    {
      "epoch": 1.7454217667710408,
      "grad_norm": 0.09238504618406296,
      "learning_rate": 4.458042840401019e-05,
      "loss": 0.3918,
      "step": 1150
    },
    {
      "epoch": 1.7530126197931493,
      "grad_norm": 0.3078314960002899,
      "learning_rate": 4.414176350900909e-05,
      "loss": 0.2956,
      "step": 1155
    },
    {
      "epoch": 1.7606034728152578,
      "grad_norm": 0.2649385929107666,
      "learning_rate": 4.370355533601249e-05,
      "loss": 0.3388,
      "step": 1160
    },
    {
      "epoch": 1.768194325837366,
      "grad_norm": 0.19793756306171417,
      "learning_rate": 4.3265838048768334e-05,
      "loss": 0.2931,
      "step": 1165
    },
    {
      "epoch": 1.7757851788594743,
      "grad_norm": 0.06952016800642014,
      "learning_rate": 4.282864577275403e-05,
      "loss": 0.2593,
      "step": 1170
    },
    {
      "epoch": 1.7833760318815828,
      "grad_norm": 0.08573190122842789,
      "learning_rate": 4.2392012592515785e-05,
      "loss": 0.3249,
      "step": 1175
    },
    {
      "epoch": 1.790966884903691,
      "grad_norm": 0.17260582745075226,
      "learning_rate": 4.195597254901147e-05,
      "loss": 0.2837,
      "step": 1180
    },
    {
      "epoch": 1.7985577379257993,
      "grad_norm": 0.14388003945350647,
      "learning_rate": 4.152055963695652e-05,
      "loss": 0.3507,
      "step": 1185
    },
    {
      "epoch": 1.8061485909479078,
      "grad_norm": 0.09567199647426605,
      "learning_rate": 4.1085807802173796e-05,
      "loss": 0.3801,
      "step": 1190
    },
    {
      "epoch": 1.8137394439700163,
      "grad_norm": 0.052392423152923584,
      "learning_rate": 4.065175093894694e-05,
      "loss": 0.3151,
      "step": 1195
    },
    {
      "epoch": 1.8213302969921243,
      "grad_norm": 0.1070229709148407,
      "learning_rate": 4.021842288737797e-05,
      "loss": 0.341,
      "step": 1200
    },
    {
      "epoch": 1.8289211500142328,
      "grad_norm": 0.19132941961288452,
      "learning_rate": 3.978585743074909e-05,
      "loss": 0.2827,
      "step": 1205
    },
    {
      "epoch": 1.8365120030363413,
      "grad_norm": 0.14613649249076843,
      "learning_rate": 3.9354088292888716e-05,
      "loss": 0.3597,
      "step": 1210
    },
    {
      "epoch": 1.8441028560584496,
      "grad_norm": 0.3268943727016449,
      "learning_rate": 3.89231491355424e-05,
      "loss": 0.3455,
      "step": 1215
    },
    {
      "epoch": 1.8516937090805579,
      "grad_norm": 0.15382400155067444,
      "learning_rate": 3.849307355574844e-05,
      "loss": 0.3158,
      "step": 1220
    },
    {
      "epoch": 1.8592845621026663,
      "grad_norm": 0.09326305985450745,
      "learning_rate": 3.80638950832186e-05,
      "loss": 0.286,
      "step": 1225
    },
    {
      "epoch": 1.8668754151247746,
      "grad_norm": 0.32320550084114075,
      "learning_rate": 3.763564717772397e-05,
      "loss": 0.3412,
      "step": 1230
    },
    {
      "epoch": 1.8744662681468829,
      "grad_norm": 0.08675755560398102,
      "learning_rate": 3.720836322648652e-05,
      "loss": 0.2893,
      "step": 1235
    },
    {
      "epoch": 1.8820571211689914,
      "grad_norm": 0.19032827019691467,
      "learning_rate": 3.6782076541576e-05,
      "loss": 0.307,
      "step": 1240
    },
    {
      "epoch": 1.8896479741910999,
      "grad_norm": 0.09857955574989319,
      "learning_rate": 3.635682035731292e-05,
      "loss": 0.2604,
      "step": 1245
    },
    {
      "epoch": 1.8972388272132081,
      "grad_norm": 0.05677346885204315,
      "learning_rate": 3.5932627827677536e-05,
      "loss": 0.3252,
      "step": 1250
    },
    {
      "epoch": 1.9048296802353164,
      "grad_norm": 0.10897451639175415,
      "learning_rate": 3.550953202372503e-05,
      "loss": 0.4204,
      "step": 1255
    },
    {
      "epoch": 1.912420533257425,
      "grad_norm": 0.12246419489383698,
      "learning_rate": 3.5087565931007316e-05,
      "loss": 0.3542,
      "step": 1260
    },
    {
      "epoch": 1.9200113862795332,
      "grad_norm": 0.09597641974687576,
      "learning_rate": 3.466676244700127e-05,
      "loss": 0.2573,
      "step": 1265
    },
    {
      "epoch": 1.9276022393016414,
      "grad_norm": 0.09933824092149734,
      "learning_rate": 3.4247154378544087e-05,
      "loss": 0.2844,
      "step": 1270
    },
    {
      "epoch": 1.93519309232375,
      "grad_norm": 0.25755199790000916,
      "learning_rate": 3.382877443927549e-05,
      "loss": 0.2835,
      "step": 1275
    },
    {
      "epoch": 1.9427839453458582,
      "grad_norm": 0.15798412263393402,
      "learning_rate": 3.34116552470874e-05,
      "loss": 0.3273,
      "step": 1280
    },
    {
      "epoch": 1.9503747983679665,
      "grad_norm": 0.2535303235054016,
      "learning_rate": 3.299582932158085e-05,
      "loss": 0.2858,
      "step": 1285
    },
    {
      "epoch": 1.957965651390075,
      "grad_norm": 0.06856276839971542,
      "learning_rate": 3.258132908153074e-05,
      "loss": 0.3002,
      "step": 1290
    },
    {
      "epoch": 1.9655565044121834,
      "grad_norm": 0.13859562575817108,
      "learning_rate": 3.216818684235844e-05,
      "loss": 0.2957,
      "step": 1295
    },
    {
      "epoch": 1.9731473574342917,
      "grad_norm": 0.1521427035331726,
      "learning_rate": 3.1756434813612266e-05,
      "loss": 0.2836,
      "step": 1300
    },
    {
      "epoch": 1.9807382104564,
      "grad_norm": 0.15251488983631134,
      "learning_rate": 3.134610509645655e-05,
      "loss": 0.3087,
      "step": 1305
    },
    {
      "epoch": 1.9883290634785085,
      "grad_norm": 0.15713337063789368,
      "learning_rate": 3.093722968116873e-05,
      "loss": 0.2847,
      "step": 1310
    },
    {
      "epoch": 1.9959199165006167,
      "grad_norm": 0.07002197206020355,
      "learning_rate": 3.052984044464548e-05,
      "loss": 0.281,
      "step": 1315
    },
    {
      "epoch": 2.003036341208843,
      "grad_norm": 0.11551013588905334,
      "learning_rate": 3.012396914791744e-05,
      "loss": 0.362,
      "step": 1320
    },
    {
      "epoch": 2.0106271942309517,
      "grad_norm": 0.18070758879184723,
      "learning_rate": 2.971964743367309e-05,
      "loss": 0.2798,
      "step": 1325
    },
    {
      "epoch": 2.01821804725306,
      "grad_norm": 0.12462901324033737,
      "learning_rate": 2.9316906823791753e-05,
      "loss": 0.2792,
      "step": 1330
    },
    {
      "epoch": 2.0258089002751682,
      "grad_norm": 0.1277770847082138,
      "learning_rate": 2.8915778716886093e-05,
      "loss": 0.3313,
      "step": 1335
    },
    {
      "epoch": 2.0333997532972767,
      "grad_norm": 0.08295406401157379,
      "learning_rate": 2.8516294385854282e-05,
      "loss": 0.3326,
      "step": 1340
    },
    {
      "epoch": 2.0409906063193852,
      "grad_norm": 0.22416232526302338,
      "learning_rate": 2.811848497544175e-05,
      "loss": 0.2702,
      "step": 1345
    },
    {
      "epoch": 2.0485814593414937,
      "grad_norm": 0.23556384444236755,
      "learning_rate": 2.7722381499813233e-05,
      "loss": 0.3045,
      "step": 1350
    },
    {
      "epoch": 2.0561723123636018,
      "grad_norm": 0.21475334465503693,
      "learning_rate": 2.7328014840134658e-05,
      "loss": 0.3191,
      "step": 1355
    },
    {
      "epoch": 2.0637631653857103,
      "grad_norm": 0.14084239304065704,
      "learning_rate": 2.693541574216575e-05,
      "loss": 0.3398,
      "step": 1360
    },
    {
      "epoch": 2.0713540184078187,
      "grad_norm": 0.1728208363056183,
      "learning_rate": 2.6544614813862857e-05,
      "loss": 0.2602,
      "step": 1365
    },
    {
      "epoch": 2.078944871429927,
      "grad_norm": 0.12706124782562256,
      "learning_rate": 2.61556425229928e-05,
      "loss": 0.2541,
      "step": 1370
    },
    {
      "epoch": 2.0865357244520353,
      "grad_norm": 0.12707297503948212,
      "learning_rate": 2.5768529194757474e-05,
      "loss": 0.242,
      "step": 1375
    },
    {
      "epoch": 2.0941265774741438,
      "grad_norm": 0.04940659552812576,
      "learning_rate": 2.538330500942963e-05,
      "loss": 0.3645,
      "step": 1380
    },
    {
      "epoch": 2.101717430496252,
      "grad_norm": 0.16009534895420074,
      "learning_rate": 2.500000000000001e-05,
      "loss": 0.2547,
      "step": 1385
    },
    {
      "epoch": 2.1093082835183603,
      "grad_norm": 0.3543836176395416,
      "learning_rate": 2.4618644049835782e-05,
      "loss": 0.3354,
      "step": 1390
    },
    {
      "epoch": 2.116899136540469,
      "grad_norm": 0.10430457442998886,
      "learning_rate": 2.4239266890350904e-05,
      "loss": 0.283,
      "step": 1395
    },
    {
      "epoch": 2.1244899895625773,
      "grad_norm": 0.14833734929561615,
      "learning_rate": 2.3861898098688057e-05,
      "loss": 0.3425,
      "step": 1400
    },
    {
      "epoch": 2.1320808425846853,
      "grad_norm": 0.272958904504776,
      "learning_rate": 2.3486567095412864e-05,
      "loss": 0.3378,
      "step": 1405
    },
    {
      "epoch": 2.139671695606794,
      "grad_norm": 0.10397733747959137,
      "learning_rate": 2.3113303142220094e-05,
      "loss": 0.2804,
      "step": 1410
    },
    {
      "epoch": 2.1472625486289023,
      "grad_norm": 0.10424613207578659,
      "learning_rate": 2.2742135339652398e-05,
      "loss": 0.2909,
      "step": 1415
    },
    {
      "epoch": 2.1548534016510104,
      "grad_norm": 0.1362823098897934,
      "learning_rate": 2.2373092624831566e-05,
      "loss": 0.2652,
      "step": 1420
    },
    {
      "epoch": 2.162444254673119,
      "grad_norm": 0.13256202638149261,
      "learning_rate": 2.2006203769202482e-05,
      "loss": 0.2872,
      "step": 1425
    },
    {
      "epoch": 2.1700351076952273,
      "grad_norm": 0.30884748697280884,
      "learning_rate": 2.1641497376290122e-05,
      "loss": 0.3112,
      "step": 1430
    },
    {
      "epoch": 2.177625960717336,
      "grad_norm": 0.1805962771177292,
      "learning_rate": 2.1279001879469424e-05,
      "loss": 0.2451,
      "step": 1435
    },
    {
      "epoch": 2.185216813739444,
      "grad_norm": 0.11278484761714935,
      "learning_rate": 2.0918745539748686e-05,
      "loss": 0.2565,
      "step": 1440
    },
    {
      "epoch": 2.1928076667615524,
      "grad_norm": 0.15021051466464996,
      "learning_rate": 2.0560756443566148e-05,
      "loss": 0.2576,
      "step": 1445
    },
    {
      "epoch": 2.200398519783661,
      "grad_norm": 0.12693342566490173,
      "learning_rate": 2.0205062500600446e-05,
      "loss": 0.2776,
      "step": 1450
    },
    {
      "epoch": 2.207989372805769,
      "grad_norm": 0.08629851788282394,
      "learning_rate": 1.985169144159456e-05,
      "loss": 0.3334,
      "step": 1455
    },
    {
      "epoch": 2.2155802258278774,
      "grad_norm": 0.10110223293304443,
      "learning_rate": 1.9500670816193968e-05,
      "loss": 0.2517,
      "step": 1460
    },
    {
      "epoch": 2.223171078849986,
      "grad_norm": 0.27294450998306274,
      "learning_rate": 1.9152027990798748e-05,
      "loss": 0.3316,
      "step": 1465
    },
    {
      "epoch": 2.230761931872094,
      "grad_norm": 0.22429701685905457,
      "learning_rate": 1.8805790146430063e-05,
      "loss": 0.2648,
      "step": 1470
    },
    {
      "epoch": 2.2383527848942024,
      "grad_norm": 0.13777467608451843,
      "learning_rate": 1.8461984276611084e-05,
      "loss": 0.346,
      "step": 1475
    },
    {
      "epoch": 2.245943637916311,
      "grad_norm": 0.30676957964897156,
      "learning_rate": 1.8120637185262418e-05,
      "loss": 0.4114,
      "step": 1480
    },
    {
      "epoch": 2.2535344909384194,
      "grad_norm": 0.15495023131370544,
      "learning_rate": 1.778177548461255e-05,
      "loss": 0.3818,
      "step": 1485
    },
    {
      "epoch": 2.2611253439605274,
      "grad_norm": 0.13254746794700623,
      "learning_rate": 1.744542559312295e-05,
      "loss": 0.3267,
      "step": 1490
    },
    {
      "epoch": 2.268716196982636,
      "grad_norm": 0.1586749106645584,
      "learning_rate": 1.7111613733428522e-05,
      "loss": 0.2979,
      "step": 1495
    },
    {
      "epoch": 2.2763070500047444,
      "grad_norm": 0.14043815433979034,
      "learning_rate": 1.6780365930293163e-05,
      "loss": 0.2736,
      "step": 1500
    },
    {
      "epoch": 2.2763070500047444,
      "eval_loss": 0.4912321865558624,
      "eval_runtime": 247.8053,
      "eval_samples_per_second": 9.455,
      "eval_steps_per_second": 9.455,
      "step": 1500
    },
    {
      "epoch": 2.2838979030268525,
      "grad_norm": 0.08288119733333588,
      "learning_rate": 1.6451708008580907e-05,
      "loss": 0.2354,
      "step": 1505
    },
    {
      "epoch": 2.291488756048961,
      "grad_norm": 0.23708699643611908,
      "learning_rate": 1.6125665591242433e-05,
      "loss": 0.3625,
      "step": 1510
    },
    {
      "epoch": 2.2990796090710695,
      "grad_norm": 0.030745351687073708,
      "learning_rate": 1.58022640973175e-05,
      "loss": 0.266,
      "step": 1515
    },
    {
      "epoch": 2.3066704620931775,
      "grad_norm": 0.11616528779268265,
      "learning_rate": 1.5481528739953272e-05,
      "loss": 0.3145,
      "step": 1520
    },
    {
      "epoch": 2.314261315115286,
      "grad_norm": 0.25872117280960083,
      "learning_rate": 1.5163484524438516e-05,
      "loss": 0.2714,
      "step": 1525
    },
    {
      "epoch": 2.3218521681373945,
      "grad_norm": 0.10753320902585983,
      "learning_rate": 1.4848156246254263e-05,
      "loss": 0.2801,
      "step": 1530
    },
    {
      "epoch": 2.329443021159503,
      "grad_norm": 0.18508432805538177,
      "learning_rate": 1.4535568489140594e-05,
      "loss": 0.2678,
      "step": 1535
    },
    {
      "epoch": 2.337033874181611,
      "grad_norm": 0.18051201105117798,
      "learning_rate": 1.422574562318007e-05,
      "loss": 0.2981,
      "step": 1540
    },
    {
      "epoch": 2.3446247272037195,
      "grad_norm": 0.12133996188640594,
      "learning_rate": 1.3918711802897789e-05,
      "loss": 0.286,
      "step": 1545
    },
    {
      "epoch": 2.352215580225828,
      "grad_norm": 0.13994358479976654,
      "learning_rate": 1.3614490965378257e-05,
      "loss": 0.274,
      "step": 1550
    },
    {
      "epoch": 2.359806433247936,
      "grad_norm": 0.1304178684949875,
      "learning_rate": 1.3313106828399147e-05,
      "loss": 0.3661,
      "step": 1555
    },
    {
      "epoch": 2.3673972862700445,
      "grad_norm": 0.30562353134155273,
      "learning_rate": 1.3014582888582232e-05,
      "loss": 0.2732,
      "step": 1560
    },
    {
      "epoch": 2.374988139292153,
      "grad_norm": 0.03359885513782501,
      "learning_rate": 1.271894241956158e-05,
      "loss": 0.286,
      "step": 1565
    },
    {
      "epoch": 2.3825789923142615,
      "grad_norm": 0.33226141333580017,
      "learning_rate": 1.2426208470168965e-05,
      "loss": 0.2796,
      "step": 1570
    },
    {
      "epoch": 2.3901698453363696,
      "grad_norm": 0.07864666730165482,
      "learning_rate": 1.213640386263708e-05,
      "loss": 0.2444,
      "step": 1575
    },
    {
      "epoch": 2.397760698358478,
      "grad_norm": 0.12879331409931183,
      "learning_rate": 1.1849551190820125e-05,
      "loss": 0.3099,
      "step": 1580
    },
    {
      "epoch": 2.4053515513805865,
      "grad_norm": 0.12465628236532211,
      "learning_rate": 1.156567281843241e-05,
      "loss": 0.2591,
      "step": 1585
    },
    {
      "epoch": 2.4129424044026946,
      "grad_norm": 0.22187581658363342,
      "learning_rate": 1.1284790877304807e-05,
      "loss": 0.3553,
      "step": 1590
    },
    {
      "epoch": 2.420533257424803,
      "grad_norm": 0.16689549386501312,
      "learning_rate": 1.1006927265659334e-05,
      "loss": 0.2627,
      "step": 1595
    },
    {
      "epoch": 2.4281241104469116,
      "grad_norm": 0.12732334434986115,
      "learning_rate": 1.0732103646401847e-05,
      "loss": 0.2605,
      "step": 1600
    },
    {
      "epoch": 2.43571496346902,
      "grad_norm": 0.051952481269836426,
      "learning_rate": 1.0460341445433191e-05,
      "loss": 0.339,
      "step": 1605
    },
    {
      "epoch": 2.443305816491128,
      "grad_norm": 0.13088718056678772,
      "learning_rate": 1.0191661849978824e-05,
      "loss": 0.3354,
      "step": 1610
    },
    {
      "epoch": 2.4508966695132366,
      "grad_norm": 0.12184888124465942,
      "learning_rate": 9.926085806936918e-06,
      "loss": 0.3414,
      "step": 1615
    },
    {
      "epoch": 2.458487522535345,
      "grad_norm": 0.10065177083015442,
      "learning_rate": 9.663634021245399e-06,
      "loss": 0.2189,
      "step": 1620
    },
    {
      "epoch": 2.466078375557453,
      "grad_norm": 0.09585690498352051,
      "learning_rate": 9.404326954267634e-06,
      "loss": 0.1881,
      "step": 1625
    },
    {
      "epoch": 2.4736692285795616,
      "grad_norm": 0.23267728090286255,
      "learning_rate": 9.148184822197282e-06,
      "loss": 0.3027,
      "step": 1630
    },
    {
      "epoch": 2.48126008160167,
      "grad_norm": 0.15521545708179474,
      "learning_rate": 8.895227594482164e-06,
      "loss": 0.2562,
      "step": 1635
    },
    {
      "epoch": 2.488850934623778,
      "grad_norm": 0.12793651223182678,
      "learning_rate": 8.645474992267438e-06,
      "loss": 0.244,
      "step": 1640
    },
    {
      "epoch": 2.4964417876458866,
      "grad_norm": 0.2494293749332428,
      "learning_rate": 8.398946486858029e-06,
      "loss": 0.3017,
      "step": 1645
    },
    {
      "epoch": 2.504032640667995,
      "grad_norm": 0.15070238709449768,
      "learning_rate": 8.155661298200635e-06,
      "loss": 0.2686,
      "step": 1650
    },
    {
      "epoch": 2.511623493690103,
      "grad_norm": 0.11418969929218292,
      "learning_rate": 7.915638393385316e-06,
      "loss": 0.3993,
      "step": 1655
    },
    {
      "epoch": 2.5192143467122117,
      "grad_norm": 0.23196111619472504,
      "learning_rate": 7.67889648516672e-06,
      "loss": 0.316,
      "step": 1660
    },
    {
      "epoch": 2.52680519973432,
      "grad_norm": 0.13710691034793854,
      "learning_rate": 7.445454030505256e-06,
      "loss": 0.3051,
      "step": 1665
    },
    {
      "epoch": 2.5343960527564287,
      "grad_norm": 0.12727871537208557,
      "learning_rate": 7.215329229128076e-06,
      "loss": 0.2903,
      "step": 1670
    },
    {
      "epoch": 2.5419869057785367,
      "grad_norm": 0.11962519586086273,
| "learning_rate": 6.988540022110235e-06, | |
| "loss": 0.3204, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 2.549577758800645, | |
| "grad_norm": 0.14515872299671173, | |
| "learning_rate": 6.765104090475932e-06, | |
| "loss": 0.3121, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.5571686118227537, | |
| "grad_norm": 0.2983834445476532, | |
| "learning_rate": 6.545038853820096e-06, | |
| "loss": 0.3353, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 2.5647594648448617, | |
| "grad_norm": 0.1801896095275879, | |
| "learning_rate": 6.328361468950267e-06, | |
| "loss": 0.4262, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.57235031786697, | |
| "grad_norm": 0.07517129927873611, | |
| "learning_rate": 6.115088828549003e-06, | |
| "loss": 0.2728, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 2.5799411708890787, | |
| "grad_norm": 0.19445601105690002, | |
| "learning_rate": 5.905237559856974e-06, | |
| "loss": 0.3221, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.587532023911187, | |
| "grad_norm": 0.10309174656867981, | |
| "learning_rate": 5.698824023376531e-06, | |
| "loss": 0.2641, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 2.5951228769332952, | |
| "grad_norm": 0.14000511169433594, | |
| "learning_rate": 5.495864311596343e-06, | |
| "loss": 0.2636, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.6027137299554037, | |
| "grad_norm": 0.11765392124652863, | |
| "learning_rate": 5.296374247736635e-06, | |
| "loss": 0.2529, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 2.6103045829775122, | |
| "grad_norm": 0.1269364356994629, | |
| "learning_rate": 5.100369384515735e-06, | |
| "loss": 0.2678, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.6178954359996203, | |
| "grad_norm": 0.06566209346055984, | |
| "learning_rate": 4.907865002937406e-06, | |
| "loss": 0.26, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 2.6254862890217288, | |
| "grad_norm": 0.07295841723680496, | |
| "learning_rate": 4.718876111099613e-06, | |
| "loss": 0.2591, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.6330771420438372, | |
| "grad_norm": 0.17099271714687347, | |
| "learning_rate": 4.533417443024374e-06, | |
| "loss": 0.24, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 2.6406679950659457, | |
| "grad_norm": 0.16684836149215698, | |
| "learning_rate": 4.351503457509093e-06, | |
| "loss": 0.2542, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.648258848088054, | |
| "grad_norm": 0.1620602011680603, | |
| "learning_rate": 4.17314833699935e-06, | |
| "loss": 0.2782, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 2.6558497011101623, | |
| "grad_norm": 0.1975618451833725, | |
| "learning_rate": 3.998365986483143e-06, | |
| "loss": 0.2835, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.6634405541322708, | |
| "grad_norm": 0.04293884336948395, | |
| "learning_rate": 3.827170032406851e-06, | |
| "loss": 0.2825, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 2.671031407154379, | |
| "grad_norm": 0.09690513461828232, | |
| "learning_rate": 3.6595738216128994e-06, | |
| "loss": 0.3289, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.6786222601764873, | |
| "grad_norm": 0.029426729306578636, | |
| "learning_rate": 3.495590420299194e-06, | |
| "loss": 0.2791, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 2.686213113198596, | |
| "grad_norm": 0.11573446542024612, | |
| "learning_rate": 3.335232613000433e-06, | |
| "loss": 0.3137, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.6938039662207043, | |
| "grad_norm": 0.26567184925079346, | |
| "learning_rate": 3.1785129015914296e-06, | |
| "loss": 0.3154, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 2.7013948192428123, | |
| "grad_norm": 0.13201837241649628, | |
| "learning_rate": 3.0254435043124083e-06, | |
| "loss": 0.3069, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.708985672264921, | |
| "grad_norm": 0.29772883653640747, | |
| "learning_rate": 2.876036354816436e-06, | |
| "loss": 0.2662, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 2.716576525287029, | |
| "grad_norm": 0.08354216068983078, | |
| "learning_rate": 2.73030310123909e-06, | |
| "loss": 0.3624, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.7241673783091374, | |
| "grad_norm": 0.259625107049942, | |
| "learning_rate": 2.5882551052902883e-06, | |
| "loss": 0.2486, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 2.731758231331246, | |
| "grad_norm": 0.22710494697093964, | |
| "learning_rate": 2.4499034413685395e-06, | |
| "loss": 0.3115, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.7393490843533543, | |
| "grad_norm": 0.07519204914569855, | |
| "learning_rate": 2.3152588956975365e-06, | |
| "loss": 0.2924, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 2.746939937375463, | |
| "grad_norm": 0.18423676490783691, | |
| "learning_rate": 2.184331965485259e-06, | |
| "loss": 0.3002, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.754530790397571, | |
| "grad_norm": 0.1939702183008194, | |
| "learning_rate": 2.057132858105548e-06, | |
| "loss": 0.3338, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 2.7621216434196794, | |
| "grad_norm": 0.20939414203166962, | |
| "learning_rate": 1.93367149030238e-06, | |
| "loss": 0.2796, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.7697124964417874, | |
| "grad_norm": 0.26544660329818726, | |
| "learning_rate": 1.813957487416651e-06, | |
| "loss": 0.3037, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 2.777303349463896, | |
| "grad_norm": 0.20273450016975403, | |
| "learning_rate": 1.6980001826358226e-06, | |
| "loss": 0.2634, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.7848942024860044, | |
| "grad_norm": 0.28037548065185547, | |
| "learning_rate": 1.585808616266271e-06, | |
| "loss": 0.4404, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 2.792485055508113, | |
| "grad_norm": 0.18899406492710114, | |
| "learning_rate": 1.4773915350284772e-06, | |
| "loss": 0.3411, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.800075908530221, | |
| "grad_norm": 0.3918057382106781, | |
| "learning_rate": 1.3727573913751013e-06, | |
| "loss": 0.2854, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 2.8076667615523294, | |
| "grad_norm": 0.1583373099565506, | |
| "learning_rate": 1.2719143428320256e-06, | |
| "loss": 0.2327, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.815257614574438, | |
| "grad_norm": 0.1468050479888916, | |
| "learning_rate": 1.1748702513623922e-06, | |
| "loss": 0.2568, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 2.822848467596546, | |
| "grad_norm": 0.14241372048854828, | |
| "learning_rate": 1.0816326827536083e-06, | |
| "loss": 0.2705, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.8304393206186544, | |
| "grad_norm": 0.1886911541223526, | |
| "learning_rate": 9.92208906027564e-07, | |
| "loss": 0.2619, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 2.838030173640763, | |
| "grad_norm": 0.16523589193820953, | |
| "learning_rate": 9.066058928738797e-07, | |
| "loss": 0.2628, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.8456210266628714, | |
| "grad_norm": 0.10537305474281311, | |
| "learning_rate": 8.248303171063898e-07, | |
| "loss": 0.2101, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 2.8532118796849795, | |
| "grad_norm": 0.08789675682783127, | |
| "learning_rate": 7.468885541428438e-07, | |
| "loss": 0.2569, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.860802732707088, | |
| "grad_norm": 0.13445930182933807, | |
| "learning_rate": 6.727866805078531e-07, | |
| "loss": 0.2946, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 2.8683935857291964, | |
| "grad_norm": 0.1602274775505066, | |
| "learning_rate": 6.02530473359153e-07, | |
| "loss": 0.2775, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.8759844387513045, | |
| "grad_norm": 0.12329945713281631, | |
| "learning_rate": 5.361254100371915e-07, | |
| "loss": 0.2592, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 2.883575291773413, | |
| "grad_norm": 0.07469865679740906, | |
| "learning_rate": 4.7357666763814813e-07, | |
| "loss": 0.3769, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.8911661447955215, | |
| "grad_norm": 0.11232876777648926, | |
| "learning_rate": 4.148891226102347e-07, | |
| "loss": 0.2907, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 2.89875699781763, | |
| "grad_norm": 0.20298312604427338, | |
| "learning_rate": 3.600673503736107e-07, | |
| "loss": 0.2756, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.906347850839738, | |
| "grad_norm": 0.24480293691158295, | |
| "learning_rate": 3.0911562496358517e-07, | |
| "loss": 0.2864, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 2.9139387038618465, | |
| "grad_norm": 0.21817973256111145, | |
| "learning_rate": 2.620379186974664e-07, | |
| "loss": 0.2548, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.921529556883955, | |
| "grad_norm": 0.09590750187635422, | |
| "learning_rate": 2.1883790186483234e-07, | |
| "loss": 0.296, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 2.929120409906063, | |
| "grad_norm": 0.07791176438331604, | |
| "learning_rate": 1.79518942441409e-07, | |
| "loss": 0.2834, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.9367112629281715, | |
| "grad_norm": 0.1533590704202652, | |
| "learning_rate": 1.440841058264808e-07, | |
| "loss": 0.3561, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 2.94430211595028, | |
| "grad_norm": 0.046548765152692795, | |
| "learning_rate": 1.1253615460391498e-07, | |
| "loss": 0.227, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.9518929689723885, | |
| "grad_norm": 0.30706334114074707, | |
| "learning_rate": 8.48775483267783e-08, | |
| "loss": 0.2787, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 2.9594838219944966, | |
| "grad_norm": 0.24995796382427216, | |
| "learning_rate": 6.111044332557936e-08, | |
| "loss": 0.2536, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.967074675016605, | |
| "grad_norm": 0.1498573124408722, | |
| "learning_rate": 4.123669254017526e-08, | |
| "loss": 0.3097, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 2.974665528038713, | |
| "grad_norm": 0.1821010708808899, | |
| "learning_rate": 2.525784537528164e-08, | |
| "loss": 0.2778, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.9822563810608216, | |
| "grad_norm": 0.07815810292959213, | |
| "learning_rate": 1.3175147579702619e-08, | |
| "loss": 0.2583, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 2.98984723408293, | |
| "grad_norm": 0.16738541424274445, | |
| "learning_rate": 4.9895411492084656e-09, | |
| "loss": 0.2417, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.9974380871050386, | |
| "grad_norm": 0.0907551497220993, | |
| "learning_rate": 7.016642530777162e-10, | |
| "loss": 0.308, | |
| "step": 1975 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1977, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.852198258216796e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |