Aiview_Qwen3-0.6B / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1977,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007590853022108359,
"grad_norm": 0.3484795391559601,
"learning_rate": 2.0202020202020206e-06,
"loss": 1.3363,
"step": 5
},
{
"epoch": 0.015181706044216719,
"grad_norm": 0.22640460729599,
"learning_rate": 4.5454545454545455e-06,
"loss": 1.2804,
"step": 10
},
{
"epoch": 0.02277255906632508,
"grad_norm": 0.6324455142021179,
"learning_rate": 7.0707070707070704e-06,
"loss": 1.4853,
"step": 15
},
{
"epoch": 0.030363412088433437,
"grad_norm": 0.22372472286224365,
"learning_rate": 9.595959595959595e-06,
"loss": 1.4012,
"step": 20
},
{
"epoch": 0.0379542651105418,
"grad_norm": 0.36963725090026855,
"learning_rate": 1.2121212121212122e-05,
"loss": 1.3461,
"step": 25
},
{
"epoch": 0.04554511813265016,
"grad_norm": 0.6136270761489868,
"learning_rate": 1.4646464646464647e-05,
"loss": 1.2617,
"step": 30
},
{
"epoch": 0.053135971154758516,
"grad_norm": 0.20270676910877228,
"learning_rate": 1.7171717171717173e-05,
"loss": 1.1316,
"step": 35
},
{
"epoch": 0.060726824176866874,
"grad_norm": 0.7699366807937622,
"learning_rate": 1.9696969696969697e-05,
"loss": 1.0233,
"step": 40
},
{
"epoch": 0.06831767719897523,
"grad_norm": 0.1857442855834961,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.9781,
"step": 45
},
{
"epoch": 0.0759085302210836,
"grad_norm": 0.18821729719638824,
"learning_rate": 2.474747474747475e-05,
"loss": 0.9322,
"step": 50
},
{
"epoch": 0.08349938324319195,
"grad_norm": 0.2683129906654358,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.921,
"step": 55
},
{
"epoch": 0.09109023626530031,
"grad_norm": 0.17556847631931305,
"learning_rate": 2.9797979797979796e-05,
"loss": 0.7989,
"step": 60
},
{
"epoch": 0.09868108928740867,
"grad_norm": 0.13853396475315094,
"learning_rate": 3.232323232323233e-05,
"loss": 0.8641,
"step": 65
},
{
"epoch": 0.10627194230951703,
"grad_norm": 0.20103883743286133,
"learning_rate": 3.484848484848485e-05,
"loss": 0.8642,
"step": 70
},
{
"epoch": 0.1138627953316254,
"grad_norm": 0.3428489565849304,
"learning_rate": 3.7373737373737376e-05,
"loss": 0.9104,
"step": 75
},
{
"epoch": 0.12145364835373375,
"grad_norm": 0.17233586311340332,
"learning_rate": 3.98989898989899e-05,
"loss": 0.7537,
"step": 80
},
{
"epoch": 0.1290445013758421,
"grad_norm": 0.15092326700687408,
"learning_rate": 4.242424242424243e-05,
"loss": 0.6822,
"step": 85
},
{
"epoch": 0.13663535439795046,
"grad_norm": 0.2307768166065216,
"learning_rate": 4.494949494949495e-05,
"loss": 0.6993,
"step": 90
},
{
"epoch": 0.14422620742005882,
"grad_norm": 0.30504322052001953,
"learning_rate": 4.7474747474747476e-05,
"loss": 0.6094,
"step": 95
},
{
"epoch": 0.1518170604421672,
"grad_norm": 0.4996664524078369,
"learning_rate": 5e-05,
"loss": 0.742,
"step": 100
},
{
"epoch": 0.15940791346427555,
"grad_norm": 0.14200103282928467,
"learning_rate": 5.2525252525252536e-05,
"loss": 0.5918,
"step": 105
},
{
"epoch": 0.1669987664863839,
"grad_norm": 0.15189217031002045,
"learning_rate": 5.5050505050505056e-05,
"loss": 0.5884,
"step": 110
},
{
"epoch": 0.17458961950849228,
"grad_norm": 0.25142595171928406,
"learning_rate": 5.757575757575758e-05,
"loss": 0.4829,
"step": 115
},
{
"epoch": 0.18218047253060063,
"grad_norm": 0.2258097231388092,
"learning_rate": 6.01010101010101e-05,
"loss": 0.6297,
"step": 120
},
{
"epoch": 0.18977132555270898,
"grad_norm": 0.3805074095726013,
"learning_rate": 6.262626262626264e-05,
"loss": 0.5932,
"step": 125
},
{
"epoch": 0.19736217857481733,
"grad_norm": 0.24002455174922943,
"learning_rate": 6.515151515151516e-05,
"loss": 0.6145,
"step": 130
},
{
"epoch": 0.2049530315969257,
"grad_norm": 0.2202799916267395,
"learning_rate": 6.767676767676769e-05,
"loss": 0.5086,
"step": 135
},
{
"epoch": 0.21254388461903406,
"grad_norm": 0.2224930077791214,
"learning_rate": 7.020202020202021e-05,
"loss": 0.4833,
"step": 140
},
{
"epoch": 0.22013473764114241,
"grad_norm": 0.2609390914440155,
"learning_rate": 7.272727272727273e-05,
"loss": 0.5377,
"step": 145
},
{
"epoch": 0.2277255906632508,
"grad_norm": 0.10272058844566345,
"learning_rate": 7.525252525252525e-05,
"loss": 0.438,
"step": 150
},
{
"epoch": 0.23531644368535914,
"grad_norm": 0.289846807718277,
"learning_rate": 7.777777777777778e-05,
"loss": 0.5454,
"step": 155
},
{
"epoch": 0.2429072967074675,
"grad_norm": 0.23176854848861694,
"learning_rate": 8.03030303030303e-05,
"loss": 0.5868,
"step": 160
},
{
"epoch": 0.25049814972957585,
"grad_norm": 0.24970294535160065,
"learning_rate": 8.282828282828283e-05,
"loss": 0.5515,
"step": 165
},
{
"epoch": 0.2580890027516842,
"grad_norm": 0.3183782398700714,
"learning_rate": 8.535353535353535e-05,
"loss": 0.4979,
"step": 170
},
{
"epoch": 0.2656798557737926,
"grad_norm": 0.3254294693470001,
"learning_rate": 8.787878787878789e-05,
"loss": 0.4923,
"step": 175
},
{
"epoch": 0.27327070879590093,
"grad_norm": 0.41256552934646606,
"learning_rate": 9.040404040404041e-05,
"loss": 0.4654,
"step": 180
},
{
"epoch": 0.2808615618180093,
"grad_norm": 0.21282930672168732,
"learning_rate": 9.292929292929293e-05,
"loss": 0.4763,
"step": 185
},
{
"epoch": 0.28845241484011763,
"grad_norm": 0.13954608142375946,
"learning_rate": 9.545454545454546e-05,
"loss": 0.4511,
"step": 190
},
{
"epoch": 0.296043267862226,
"grad_norm": 0.33055710792541504,
"learning_rate": 9.797979797979798e-05,
"loss": 0.4938,
"step": 195
},
{
"epoch": 0.3036341208843344,
"grad_norm": 0.2626458704471588,
"learning_rate": 9.999992203714313e-05,
"loss": 0.4624,
"step": 200
},
{
"epoch": 0.3112249739064427,
"grad_norm": 0.2240830659866333,
"learning_rate": 9.999719336268101e-05,
"loss": 0.4803,
"step": 205
},
{
"epoch": 0.3188158269285511,
"grad_norm": 0.20912809669971466,
"learning_rate": 9.999056678850014e-05,
"loss": 0.4875,
"step": 210
},
{
"epoch": 0.3264066799506595,
"grad_norm": 0.3838413655757904,
"learning_rate": 9.9980042831224e-05,
"loss": 0.6046,
"step": 215
},
{
"epoch": 0.3339975329727678,
"grad_norm": 0.2766261398792267,
"learning_rate": 9.996562231132523e-05,
"loss": 0.4424,
"step": 220
},
{
"epoch": 0.3415883859948762,
"grad_norm": 0.24665701389312744,
"learning_rate": 9.994730635306174e-05,
"loss": 0.427,
"step": 225
},
{
"epoch": 0.34917923901698456,
"grad_norm": 0.09618967026472092,
"learning_rate": 9.992509638438907e-05,
"loss": 0.4912,
"step": 230
},
{
"epoch": 0.3567700920390929,
"grad_norm": 0.09648390114307404,
"learning_rate": 9.9898994136849e-05,
"loss": 0.5337,
"step": 235
},
{
"epoch": 0.36436094506120126,
"grad_norm": 0.38818395137786865,
"learning_rate": 9.986900164543467e-05,
"loss": 0.5244,
"step": 240
},
{
"epoch": 0.37195179808330964,
"grad_norm": 0.1550498604774475,
"learning_rate": 9.983512124843177e-05,
"loss": 0.4584,
"step": 245
},
{
"epoch": 0.37954265110541796,
"grad_norm": 0.19514597952365875,
"learning_rate": 9.97973555872364e-05,
"loss": 0.4544,
"step": 250
},
{
"epoch": 0.38713350412752634,
"grad_norm": 0.229110985994339,
"learning_rate": 9.975570760614902e-05,
"loss": 0.4177,
"step": 255
},
{
"epoch": 0.39472435714963466,
"grad_norm": 0.157244473695755,
"learning_rate": 9.971018055214496e-05,
"loss": 0.435,
"step": 260
},
{
"epoch": 0.40231521017174304,
"grad_norm": 0.1850701868534088,
"learning_rate": 9.966077797462129e-05,
"loss": 0.4151,
"step": 265
},
{
"epoch": 0.4099060631938514,
"grad_norm": 0.29395997524261475,
"learning_rate": 9.960750372512007e-05,
"loss": 0.413,
"step": 270
},
{
"epoch": 0.41749691621595975,
"grad_norm": 0.13114891946315765,
"learning_rate": 9.955036195702806e-05,
"loss": 0.4548,
"step": 275
},
{
"epoch": 0.4250877692380681,
"grad_norm": 0.1735813319683075,
"learning_rate": 9.948935712525299e-05,
"loss": 0.4605,
"step": 280
},
{
"epoch": 0.4326786222601765,
"grad_norm": 0.3185722529888153,
"learning_rate": 9.942449398587616e-05,
"loss": 0.556,
"step": 285
},
{
"epoch": 0.44026947528228483,
"grad_norm": 0.09232311695814133,
"learning_rate": 9.935577759578167e-05,
"loss": 0.3765,
"step": 290
},
{
"epoch": 0.4478603283043932,
"grad_norm": 0.1775842010974884,
"learning_rate": 9.928321331226219e-05,
"loss": 0.5862,
"step": 295
},
{
"epoch": 0.4554511813265016,
"grad_norm": 0.17507719993591309,
"learning_rate": 9.92068067926013e-05,
"loss": 0.5954,
"step": 300
},
{
"epoch": 0.4630420343486099,
"grad_norm": 0.1269141137599945,
"learning_rate": 9.912656399363238e-05,
"loss": 0.4592,
"step": 305
},
{
"epoch": 0.4706328873707183,
"grad_norm": 0.2526402771472931,
"learning_rate": 9.904249117127428e-05,
"loss": 0.4719,
"step": 310
},
{
"epoch": 0.47822374039282667,
"grad_norm": 0.09530569612979889,
"learning_rate": 9.895459488004356e-05,
"loss": 0.3575,
"step": 315
},
{
"epoch": 0.485814593414935,
"grad_norm": 0.1187816858291626,
"learning_rate": 9.886288197254341e-05,
"loss": 0.4027,
"step": 320
},
{
"epoch": 0.49340544643704337,
"grad_norm": 0.1907995641231537,
"learning_rate": 9.876735959892953e-05,
"loss": 0.3528,
"step": 325
},
{
"epoch": 0.5009962994591517,
"grad_norm": 0.13331271708011627,
"learning_rate": 9.866803520635262e-05,
"loss": 0.4405,
"step": 330
},
{
"epoch": 0.5085871524812601,
"grad_norm": 0.08372137695550919,
"learning_rate": 9.856491653837776e-05,
"loss": 0.4757,
"step": 335
},
{
"epoch": 0.5161780055033685,
"grad_norm": 0.2553092837333679,
"learning_rate": 9.84580116343808e-05,
"loss": 0.3777,
"step": 340
},
{
"epoch": 0.5237688585254768,
"grad_norm": 0.36431610584259033,
"learning_rate": 9.834732882892146e-05,
"loss": 0.4213,
"step": 345
},
{
"epoch": 0.5313597115475852,
"grad_norm": 0.19020870327949524,
"learning_rate": 9.823287675109365e-05,
"loss": 0.485,
"step": 350
},
{
"epoch": 0.5389505645696935,
"grad_norm": 0.2752246856689453,
"learning_rate": 9.811466432385267e-05,
"loss": 0.4534,
"step": 355
},
{
"epoch": 0.5465414175918019,
"grad_norm": 0.13446789979934692,
"learning_rate": 9.79927007633196e-05,
"loss": 0.3736,
"step": 360
},
{
"epoch": 0.5541322706139102,
"grad_norm": 0.3601730167865753,
"learning_rate": 9.786699557806274e-05,
"loss": 0.3864,
"step": 365
},
{
"epoch": 0.5617231236360186,
"grad_norm": 0.28543752431869507,
"learning_rate": 9.773755856835631e-05,
"loss": 0.4452,
"step": 370
},
{
"epoch": 0.569313976658127,
"grad_norm": 0.17038831114768982,
"learning_rate": 9.760439982541646e-05,
"loss": 0.4114,
"step": 375
},
{
"epoch": 0.5769048296802353,
"grad_norm": 0.08299808204174042,
"learning_rate": 9.746752973061446e-05,
"loss": 0.4296,
"step": 380
},
{
"epoch": 0.5844956827023436,
"grad_norm": 0.23236173391342163,
"learning_rate": 9.732695895466735e-05,
"loss": 0.4577,
"step": 385
},
{
"epoch": 0.592086535724452,
"grad_norm": 0.15669377148151398,
"learning_rate": 9.718269845680606e-05,
"loss": 0.4083,
"step": 390
},
{
"epoch": 0.5996773887465604,
"grad_norm": 0.13645882904529572,
"learning_rate": 9.7034759483921e-05,
"loss": 0.3649,
"step": 395
},
{
"epoch": 0.6072682417686688,
"grad_norm": 0.05155817046761513,
"learning_rate": 9.688315356968521e-05,
"loss": 0.3618,
"step": 400
},
{
"epoch": 0.6148590947907772,
"grad_norm": 0.11633949726819992,
"learning_rate": 9.672789253365515e-05,
"loss": 0.4005,
"step": 405
},
{
"epoch": 0.6224499478128854,
"grad_norm": 0.18201440572738647,
"learning_rate": 9.656898848034926e-05,
"loss": 0.4173,
"step": 410
},
{
"epoch": 0.6300408008349938,
"grad_norm": 0.16578234732151031,
"learning_rate": 9.640645379830424e-05,
"loss": 0.3481,
"step": 415
},
{
"epoch": 0.6376316538571022,
"grad_norm": 0.2896673083305359,
"learning_rate": 9.624030115910919e-05,
"loss": 0.3855,
"step": 420
},
{
"epoch": 0.6452225068792106,
"grad_norm": 0.24677187204360962,
"learning_rate": 9.607054351641778e-05,
"loss": 0.4146,
"step": 425
},
{
"epoch": 0.652813359901319,
"grad_norm": 0.0908162072300911,
"learning_rate": 9.589719410493822e-05,
"loss": 0.4494,
"step": 430
},
{
"epoch": 0.6604042129234273,
"grad_norm": 0.07764331251382828,
"learning_rate": 9.572026643940161e-05,
"loss": 0.3942,
"step": 435
},
{
"epoch": 0.6679950659455356,
"grad_norm": 0.20974747836589813,
"learning_rate": 9.553977431350816e-05,
"loss": 0.385,
"step": 440
},
{
"epoch": 0.675585918967644,
"grad_norm": 0.17720717191696167,
"learning_rate": 9.535573179885191e-05,
"loss": 0.368,
"step": 445
},
{
"epoch": 0.6831767719897524,
"grad_norm": 0.24612246453762054,
"learning_rate": 9.516815324382356e-05,
"loss": 0.3519,
"step": 450
},
{
"epoch": 0.6907676250118607,
"grad_norm": 0.2337830364704132,
"learning_rate": 9.497705327249198e-05,
"loss": 0.4045,
"step": 455
},
{
"epoch": 0.6983584780339691,
"grad_norm": 0.20068958401679993,
"learning_rate": 9.47824467834639e-05,
"loss": 0.3792,
"step": 460
},
{
"epoch": 0.7059493310560774,
"grad_norm": 0.2409381866455078,
"learning_rate": 9.45843489487226e-05,
"loss": 0.3082,
"step": 465
},
{
"epoch": 0.7135401840781858,
"grad_norm": 0.36345911026000977,
"learning_rate": 9.438277521244489e-05,
"loss": 0.3868,
"step": 470
},
{
"epoch": 0.7211310371002941,
"grad_norm": 0.24875889718532562,
"learning_rate": 9.417774128979706e-05,
"loss": 0.3718,
"step": 475
},
{
"epoch": 0.7287218901224025,
"grad_norm": 0.11670625954866409,
"learning_rate": 9.39692631657098e-05,
"loss": 0.4664,
"step": 480
},
{
"epoch": 0.7363127431445109,
"grad_norm": 0.1045941710472107,
"learning_rate": 9.375735709363189e-05,
"loss": 0.4966,
"step": 485
},
{
"epoch": 0.7439035961666193,
"grad_norm": 0.22171644866466522,
"learning_rate": 9.354203959426304e-05,
"loss": 0.3507,
"step": 490
},
{
"epoch": 0.7514944491887275,
"grad_norm": 0.20189598202705383,
"learning_rate": 9.332332745426596e-05,
"loss": 0.5197,
"step": 495
},
{
"epoch": 0.7590853022108359,
"grad_norm": 0.1306353658437729,
"learning_rate": 9.310123772495756e-05,
"loss": 0.4887,
"step": 500
},
{
"epoch": 0.7590853022108359,
"eval_loss": 0.6389759182929993,
"eval_runtime": 241.6392,
"eval_samples_per_second": 9.696,
"eval_steps_per_second": 9.696,
"step": 500
},
{
"epoch": 0.7666761552329443,
"grad_norm": 0.1597890555858612,
"learning_rate": 9.28757877209796e-05,
"loss": 0.4177,
"step": 505
},
{
"epoch": 0.7742670082550527,
"grad_norm": 0.07365977764129639,
"learning_rate": 9.264699501894887e-05,
"loss": 0.3627,
"step": 510
},
{
"epoch": 0.7818578612771611,
"grad_norm": 0.18954086303710938,
"learning_rate": 9.241487745608681e-05,
"loss": 0.3507,
"step": 515
},
{
"epoch": 0.7894487142992693,
"grad_norm": 0.0904935896396637,
"learning_rate": 9.217945312882888e-05,
"loss": 0.4617,
"step": 520
},
{
"epoch": 0.7970395673213777,
"grad_norm": 0.0969192162156105,
"learning_rate": 9.19407403914137e-05,
"loss": 0.4213,
"step": 525
},
{
"epoch": 0.8046304203434861,
"grad_norm": 0.28966036438941956,
"learning_rate": 9.16987578544522e-05,
"loss": 0.3588,
"step": 530
},
{
"epoch": 0.8122212733655945,
"grad_norm": 0.305105984210968,
"learning_rate": 9.145352438347662e-05,
"loss": 0.3805,
"step": 535
},
{
"epoch": 0.8198121263877028,
"grad_norm": 0.09918319433927536,
"learning_rate": 9.12050590974697e-05,
"loss": 0.4213,
"step": 540
},
{
"epoch": 0.8274029794098112,
"grad_norm": 0.2274104654788971,
"learning_rate": 9.095338136737413e-05,
"loss": 0.3999,
"step": 545
},
{
"epoch": 0.8349938324319195,
"grad_norm": 0.4053689241409302,
"learning_rate": 9.069851081458242e-05,
"loss": 0.3478,
"step": 550
},
{
"epoch": 0.8425846854540279,
"grad_norm": 0.1169477105140686,
"learning_rate": 9.044046730940705e-05,
"loss": 0.3894,
"step": 555
},
{
"epoch": 0.8501755384761362,
"grad_norm": 0.13794220983982086,
"learning_rate": 9.017927096953139e-05,
"loss": 0.4559,
"step": 560
},
{
"epoch": 0.8577663914982446,
"grad_norm": 0.059803109616041183,
"learning_rate": 8.991494215844132e-05,
"loss": 0.4398,
"step": 565
},
{
"epoch": 0.865357244520353,
"grad_norm": 0.1535421907901764,
"learning_rate": 8.964750148383756e-05,
"loss": 0.396,
"step": 570
},
{
"epoch": 0.8729480975424614,
"grad_norm": 0.07398340851068497,
"learning_rate": 8.937696979602909e-05,
"loss": 0.3721,
"step": 575
},
{
"epoch": 0.8805389505645697,
"grad_norm": 0.19821172952651978,
"learning_rate": 8.910336818630765e-05,
"loss": 0.3879,
"step": 580
},
{
"epoch": 0.888129803586678,
"grad_norm": 0.16163092851638794,
"learning_rate": 8.88267179853033e-05,
"loss": 0.4246,
"step": 585
},
{
"epoch": 0.8957206566087864,
"grad_norm": 0.23528441786766052,
"learning_rate": 8.854704076132157e-05,
"loss": 0.3265,
"step": 590
},
{
"epoch": 0.9033115096308948,
"grad_norm": 0.06464134156703949,
"learning_rate": 8.826435831866184e-05,
"loss": 0.4734,
"step": 595
},
{
"epoch": 0.9109023626530032,
"grad_norm": 0.13423138856887817,
"learning_rate": 8.797869269591748e-05,
"loss": 0.3093,
"step": 600
},
{
"epoch": 0.9184932156751114,
"grad_norm": 0.18938276171684265,
"learning_rate": 8.769006616425761e-05,
"loss": 0.445,
"step": 605
},
{
"epoch": 0.9260840686972198,
"grad_norm": 0.1079874336719513,
"learning_rate": 8.739850122569091e-05,
"loss": 0.3719,
"step": 610
},
{
"epoch": 0.9336749217193282,
"grad_norm": 0.18556272983551025,
"learning_rate": 8.710402061131113e-05,
"loss": 0.3832,
"step": 615
},
{
"epoch": 0.9412657747414366,
"grad_norm": 0.14005886018276215,
"learning_rate": 8.680664727952513e-05,
"loss": 0.4454,
"step": 620
},
{
"epoch": 0.948856627763545,
"grad_norm": 0.19735439121723175,
"learning_rate": 8.650640441426274e-05,
"loss": 0.3663,
"step": 625
},
{
"epoch": 0.9564474807856533,
"grad_norm": 0.46979591250419617,
"learning_rate": 8.620331542316955e-05,
"loss": 0.358,
"step": 630
},
{
"epoch": 0.9640383338077616,
"grad_norm": 0.11601805686950684,
"learning_rate": 8.589740393578179e-05,
"loss": 0.343,
"step": 635
},
{
"epoch": 0.97162918682987,
"grad_norm": 0.1647682785987854,
"learning_rate": 8.558869380168423e-05,
"loss": 0.3644,
"step": 640
},
{
"epoch": 0.9792200398519784,
"grad_norm": 0.15652813017368317,
"learning_rate": 8.527720908865075e-05,
"loss": 0.3628,
"step": 645
},
{
"epoch": 0.9868108928740867,
"grad_norm": 0.09970453381538391,
"learning_rate": 8.496297408076798e-05,
"loss": 0.3634,
"step": 650
},
{
"epoch": 0.9944017458961951,
"grad_norm": 0.19404280185699463,
"learning_rate": 8.464601327654207e-05,
"loss": 0.3638,
"step": 655
},
{
"epoch": 1.0015181706044216,
"grad_norm": 0.07966156303882599,
"learning_rate": 8.432635138698873e-05,
"loss": 0.3526,
"step": 660
},
{
"epoch": 1.00910902362653,
"grad_norm": 0.16148845851421356,
"learning_rate": 8.400401333370662e-05,
"loss": 0.449,
"step": 665
},
{
"epoch": 1.0166998766486384,
"grad_norm": 0.16588972508907318,
"learning_rate": 8.367902424693456e-05,
"loss": 0.2698,
"step": 670
},
{
"epoch": 1.0242907296707469,
"grad_norm": 0.15112628042697906,
"learning_rate": 8.335140946359216e-05,
"loss": 0.526,
"step": 675
},
{
"epoch": 1.0318815826928551,
"grad_norm": 0.544543981552124,
"learning_rate": 8.302119452530459e-05,
"loss": 0.3187,
"step": 680
},
{
"epoch": 1.0394724357149634,
"grad_norm": 0.09538525342941284,
"learning_rate": 8.268840517641123e-05,
"loss": 0.3222,
"step": 685
},
{
"epoch": 1.0470632887370719,
"grad_norm": 0.21151971817016602,
"learning_rate": 8.235306736195865e-05,
"loss": 0.4085,
"step": 690
},
{
"epoch": 1.0546541417591802,
"grad_norm": 0.07254460453987122,
"learning_rate": 8.201520722567783e-05,
"loss": 0.3323,
"step": 695
},
{
"epoch": 1.0622449947812886,
"grad_norm": 0.2756185829639435,
"learning_rate": 8.16748511079459e-05,
"loss": 0.2916,
"step": 700
},
{
"epoch": 1.069835847803397,
"grad_norm": 0.07430507987737656,
"learning_rate": 8.13320255437327e-05,
"loss": 0.3352,
"step": 705
},
{
"epoch": 1.0774267008255052,
"grad_norm": 0.07954972982406616,
"learning_rate": 8.098675726053187e-05,
"loss": 0.2949,
"step": 710
},
{
"epoch": 1.0850175538476137,
"grad_norm": 0.10341721773147583,
"learning_rate": 8.06390731762773e-05,
"loss": 0.3474,
"step": 715
},
{
"epoch": 1.092608406869722,
"grad_norm": 0.08508666604757309,
"learning_rate": 8.028900039724443e-05,
"loss": 0.3108,
"step": 720
},
{
"epoch": 1.1001992598918304,
"grad_norm": 0.14489281177520752,
"learning_rate": 7.993656621593699e-05,
"loss": 0.3777,
"step": 725
},
{
"epoch": 1.1077901129139387,
"grad_norm": 0.07771521061658859,
"learning_rate": 7.958179810895922e-05,
"loss": 0.2953,
"step": 730
},
{
"epoch": 1.115380965936047,
"grad_norm": 0.19902044534683228,
"learning_rate": 7.92247237348738e-05,
"loss": 0.3823,
"step": 735
},
{
"epoch": 1.1229718189581555,
"grad_norm": 0.07754581421613693,
"learning_rate": 7.886537093204539e-05,
"loss": 0.4141,
"step": 740
},
{
"epoch": 1.1305626719802637,
"grad_norm": 0.08018027245998383,
"learning_rate": 7.850376771647038e-05,
"loss": 0.3957,
"step": 745
},
{
"epoch": 1.1381535250023722,
"grad_norm": 0.09160657972097397,
"learning_rate": 7.813994227959274e-05,
"loss": 0.2976,
"step": 750
},
{
"epoch": 1.1457443780244805,
"grad_norm": 0.11231184750795364,
"learning_rate": 7.777392298610594e-05,
"loss": 0.3355,
"step": 755
},
{
"epoch": 1.1533352310465887,
"grad_norm": 0.14084723591804504,
"learning_rate": 7.740573837174184e-05,
"loss": 0.3374,
"step": 760
},
{
"epoch": 1.1609260840686972,
"grad_norm": 0.083552785217762,
"learning_rate": 7.703541714104577e-05,
"loss": 0.424,
"step": 765
},
{
"epoch": 1.1685169370908055,
"grad_norm": 0.07307687401771545,
"learning_rate": 7.666298816513879e-05,
"loss": 0.414,
"step": 770
},
{
"epoch": 1.176107790112914,
"grad_norm": 0.17594853043556213,
"learning_rate": 7.628848047946675e-05,
"loss": 0.3479,
"step": 775
},
{
"epoch": 1.1836986431350223,
"grad_norm": 0.13654448091983795,
"learning_rate": 7.59119232815366e-05,
"loss": 0.3557,
"step": 780
},
{
"epoch": 1.1912894961571308,
"grad_norm": 0.13807976245880127,
"learning_rate": 7.553334592864021e-05,
"loss": 0.3345,
"step": 785
},
{
"epoch": 1.198880349179239,
"grad_norm": 0.6585047841072083,
"learning_rate": 7.515277793556545e-05,
"loss": 0.3335,
"step": 790
},
{
"epoch": 1.2064712022013473,
"grad_norm": 0.20239250361919403,
"learning_rate": 7.477024897229526e-05,
"loss": 0.3291,
"step": 795
},
{
"epoch": 1.2140620552234558,
"grad_norm": 0.13821843266487122,
"learning_rate": 7.43857888616944e-05,
"loss": 0.2888,
"step": 800
},
{
"epoch": 1.221652908245564,
"grad_norm": 0.2950344383716583,
"learning_rate": 7.399942757718455e-05,
"loss": 0.3215,
"step": 805
},
{
"epoch": 1.2292437612676725,
"grad_norm": 0.19034092128276825,
"learning_rate": 7.361119524040733e-05,
"loss": 0.3507,
"step": 810
},
{
"epoch": 1.2368346142897808,
"grad_norm": 0.11198897659778595,
"learning_rate": 7.322112211887612e-05,
"loss": 0.4081,
"step": 815
},
{
"epoch": 1.244425467311889,
"grad_norm": 0.08805972337722778,
"learning_rate": 7.282923862361612e-05,
"loss": 0.3463,
"step": 820
},
{
"epoch": 1.2520163203339976,
"grad_norm": 0.1584472358226776,
"learning_rate": 7.243557530679367e-05,
"loss": 0.3533,
"step": 825
},
{
"epoch": 1.2596071733561058,
"grad_norm": 0.1389380395412445,
"learning_rate": 7.204016285933415e-05,
"loss": 0.3155,
"step": 830
},
{
"epoch": 1.2671980263782143,
"grad_norm": 0.21182285249233246,
"learning_rate": 7.164303210852934e-05,
"loss": 0.3131,
"step": 835
},
{
"epoch": 1.2747888794003226,
"grad_norm": 0.13253886997699738,
"learning_rate": 7.124421401563403e-05,
"loss": 0.3223,
"step": 840
},
{
"epoch": 1.2823797324224309,
"grad_norm": 0.07222673296928406,
"learning_rate": 7.084373967345217e-05,
"loss": 0.3448,
"step": 845
},
{
"epoch": 1.2899705854445394,
"grad_norm": 0.137168288230896,
"learning_rate": 7.044164030391286e-05,
"loss": 0.3283,
"step": 850
},
{
"epoch": 1.2975614384666476,
"grad_norm": 0.11420410126447678,
"learning_rate": 7.003794725563617e-05,
"loss": 0.2769,
"step": 855
},
{
"epoch": 1.3051522914887561,
"grad_norm": 0.13328640162944794,
"learning_rate": 6.963269200148915e-05,
"loss": 0.3813,
"step": 860
},
{
"epoch": 1.3127431445108644,
"grad_norm": 0.2379562109708786,
"learning_rate": 6.922590613613211e-05,
"loss": 0.3248,
"step": 865
},
{
"epoch": 1.3203339975329729,
"grad_norm": 0.10297714918851852,
"learning_rate": 6.881762137355545e-05,
"loss": 0.3201,
"step": 870
},
{
"epoch": 1.3279248505550811,
"grad_norm": 0.10009097307920456,
"learning_rate": 6.840786954460713e-05,
"loss": 0.3096,
"step": 875
},
{
"epoch": 1.3355157035771894,
"grad_norm": 0.09942924976348877,
"learning_rate": 6.799668259451114e-05,
"loss": 0.3267,
"step": 880
},
{
"epoch": 1.343106556599298,
"grad_norm": 0.13200506567955017,
"learning_rate": 6.758409258037683e-05,
"loss": 0.3253,
"step": 885
},
{
"epoch": 1.3506974096214062,
"grad_norm": 0.10787712782621384,
"learning_rate": 6.717013166869982e-05,
"loss": 0.3455,
"step": 890
},
{
"epoch": 1.3582882626435144,
"grad_norm": 0.14178533852100372,
"learning_rate": 6.675483213285412e-05,
"loss": 0.4479,
"step": 895
},
{
"epoch": 1.365879115665623,
"grad_norm": 0.10185932368040085,
"learning_rate": 6.633822635057609e-05,
"loss": 0.3427,
"step": 900
},
{
"epoch": 1.3734699686877314,
"grad_norm": 0.07396234571933746,
"learning_rate": 6.592034680144008e-05,
"loss": 0.3582,
"step": 905
},
{
"epoch": 1.3810608217098397,
"grad_norm": 0.2012874335050583,
"learning_rate": 6.550122606432639e-05,
"loss": 0.3648,
"step": 910
},
{
"epoch": 1.388651674731948,
"grad_norm": 0.09361989051103592,
"learning_rate": 6.508089681488126e-05,
"loss": 0.3302,
"step": 915
},
{
"epoch": 1.3962425277540564,
"grad_norm": 0.1503518670797348,
"learning_rate": 6.465939182296943e-05,
"loss": 0.3528,
"step": 920
},
{
"epoch": 1.4038333807761647,
"grad_norm": 0.06482362002134323,
"learning_rate": 6.42367439501193e-05,
"loss": 0.3154,
"step": 925
},
{
"epoch": 1.411424233798273,
"grad_norm": 0.17551842331886292,
"learning_rate": 6.381298614696094e-05,
"loss": 0.2889,
"step": 930
},
{
"epoch": 1.4190150868203815,
"grad_norm": 0.09612052887678146,
"learning_rate": 6.338815145065727e-05,
"loss": 0.2348,
"step": 935
},
{
"epoch": 1.4266059398424897,
"grad_norm": 0.20216520130634308,
"learning_rate": 6.296227298232834e-05,
"loss": 0.3356,
"step": 940
},
{
"epoch": 1.4341967928645982,
"grad_norm": 0.11559966951608658,
"learning_rate": 6.253538394446914e-05,
"loss": 0.3254,
"step": 945
},
{
"epoch": 1.4417876458867065,
"grad_norm": 0.1131603941321373,
"learning_rate": 6.210751761836105e-05,
"loss": 0.2883,
"step": 950
},
{
"epoch": 1.449378498908815,
"grad_norm": 0.3986540734767914,
"learning_rate": 6.167870736147713e-05,
"loss": 0.3648,
"step": 955
},
{
"epoch": 1.4569693519309233,
"grad_norm": 0.3160419464111328,
"learning_rate": 6.124898660488158e-05,
"loss": 0.3506,
"step": 960
},
{
"epoch": 1.4645602049530315,
"grad_norm": 0.09258974343538284,
"learning_rate": 6.081838885062328e-05,
"loss": 0.3197,
"step": 965
},
{
"epoch": 1.47215105797514,
"grad_norm": 0.14045535027980804,
"learning_rate": 6.038694766912394e-05,
"loss": 0.3634,
"step": 970
},
{
"epoch": 1.4797419109972483,
"grad_norm": 0.1499331146478653,
"learning_rate": 5.9954696696560844e-05,
"loss": 0.2952,
"step": 975
},
{
"epoch": 1.4873327640193565,
"grad_norm": 0.12873616814613342,
"learning_rate": 5.952166963224451e-05,
"loss": 0.3923,
"step": 980
},
{
"epoch": 1.494923617041465,
"grad_norm": 0.05709124356508255,
"learning_rate": 5.908790023599144e-05,
"loss": 0.3321,
"step": 985
},
{
"epoch": 1.5025144700635735,
"grad_norm": 0.07169659435749054,
"learning_rate": 5.865342232549204e-05,
"loss": 0.3493,
"step": 990
},
{
"epoch": 1.5101053230856818,
"grad_norm": 0.11258374899625778,
"learning_rate": 5.8218269773674195e-05,
"loss": 0.3039,
"step": 995
},
{
"epoch": 1.51769617610779,
"grad_norm": 0.0858062282204628,
"learning_rate": 5.778247650606242e-05,
"loss": 0.2878,
"step": 1000
},
{
"epoch": 1.51769617610779,
"eval_loss": 0.5372660756111145,
"eval_runtime": 246.0786,
"eval_samples_per_second": 9.521,
"eval_steps_per_second": 9.521,
"step": 1000
},
{
"epoch": 1.5252870291298986,
"grad_norm": 0.09433668851852417,
"learning_rate": 5.734607649813297e-05,
"loss": 0.2803,
"step": 1005
},
{
"epoch": 1.5328778821520068,
"grad_norm": 0.18208527565002441,
"learning_rate": 5.6909103772665015e-05,
"loss": 0.3029,
"step": 1010
},
{
"epoch": 1.540468735174115,
"grad_norm": 0.1644572615623474,
"learning_rate": 5.647159239708809e-05,
"loss": 0.2799,
"step": 1015
},
{
"epoch": 1.5480595881962236,
"grad_norm": 0.13228443264961243,
"learning_rate": 5.603357648082622e-05,
"loss": 0.3209,
"step": 1020
},
{
"epoch": 1.555650441218332,
"grad_norm": 0.08718783408403397,
"learning_rate": 5.559509017263862e-05,
"loss": 0.3287,
"step": 1025
},
{
"epoch": 1.5632412942404401,
"grad_norm": 0.20194876194000244,
"learning_rate": 5.515616765795736e-05,
"loss": 0.3244,
"step": 1030
},
{
"epoch": 1.5708321472625486,
"grad_norm": 0.319332093000412,
"learning_rate": 5.471684315622218e-05,
"loss": 0.2786,
"step": 1035
},
{
"epoch": 1.578423000284657,
"grad_norm": 0.07592841982841492,
"learning_rate": 5.42771509182127e-05,
"loss": 0.404,
"step": 1040
},
{
"epoch": 1.5860138533067654,
"grad_norm": 0.07565618306398392,
"learning_rate": 5.383712522337817e-05,
"loss": 0.3701,
"step": 1045
},
{
"epoch": 1.5936047063288736,
"grad_norm": 0.14734983444213867,
"learning_rate": 5.339680037716487e-05,
"loss": 0.2615,
"step": 1050
},
{
"epoch": 1.6011955593509821,
"grad_norm": 0.12061991542577744,
"learning_rate": 5.2956210708341657e-05,
"loss": 0.4049,
"step": 1055
},
{
"epoch": 1.6087864123730904,
"grad_norm": 0.10585551708936691,
"learning_rate": 5.2515390566323574e-05,
"loss": 0.2495,
"step": 1060
},
{
"epoch": 1.6163772653951987,
"grad_norm": 0.19110319018363953,
"learning_rate": 5.2074374318493915e-05,
"loss": 0.2675,
"step": 1065
},
{
"epoch": 1.6239681184173071,
"grad_norm": 0.09903518110513687,
"learning_rate": 5.163319634752484e-05,
"loss": 0.3348,
"step": 1070
},
{
"epoch": 1.6315589714394156,
"grad_norm": 0.21369488537311554,
"learning_rate": 5.119189104869683e-05,
"loss": 0.3546,
"step": 1075
},
{
"epoch": 1.639149824461524,
"grad_norm": 0.051785871386528015,
"learning_rate": 5.075049282721715e-05,
"loss": 0.2549,
"step": 1080
},
{
"epoch": 1.6467406774836322,
"grad_norm": 0.08246368169784546,
"learning_rate": 5.030903609553753e-05,
"loss": 0.4519,
"step": 1085
},
{
"epoch": 1.6543315305057407,
"grad_norm": 0.16289927065372467,
"learning_rate": 4.9867555270671296e-05,
"loss": 0.323,
"step": 1090
},
{
"epoch": 1.661922383527849,
"grad_norm": 0.09280192106962204,
"learning_rate": 4.942608477151013e-05,
"loss": 0.3735,
"step": 1095
},
{
"epoch": 1.6695132365499572,
"grad_norm": 0.16873304545879364,
"learning_rate": 4.898465901614072e-05,
"loss": 0.3447,
"step": 1100
},
{
"epoch": 1.6771040895720657,
"grad_norm": 0.12987276911735535,
"learning_rate": 4.8543312419161396e-05,
"loss": 0.3249,
"step": 1105
},
{
"epoch": 1.6846949425941742,
"grad_norm": 0.08189664781093597,
"learning_rate": 4.8102079388999106e-05,
"loss": 0.3743,
"step": 1110
},
{
"epoch": 1.6922857956162822,
"grad_norm": 0.14963483810424805,
"learning_rate": 4.7660994325226906e-05,
"loss": 0.2677,
"step": 1115
},
{
"epoch": 1.6998766486383907,
"grad_norm": 0.08255660533905029,
"learning_rate": 4.722009161588199e-05,
"loss": 0.2741,
"step": 1120
},
{
"epoch": 1.7074675016604992,
"grad_norm": 0.14731058478355408,
"learning_rate": 4.67794056347848e-05,
"loss": 0.3997,
"step": 1125
},
{
"epoch": 1.7150583546826075,
"grad_norm": 0.0929296612739563,
"learning_rate": 4.63389707388591e-05,
"loss": 0.3421,
"step": 1130
},
{
"epoch": 1.7226492077047157,
"grad_norm": 0.1865396797657013,
"learning_rate": 4.589882126545352e-05,
"loss": 0.3422,
"step": 1135
},
{
"epoch": 1.7302400607268242,
"grad_norm": 0.22962845861911774,
"learning_rate": 4.545899152966439e-05,
"loss": 0.3236,
"step": 1140
},
{
"epoch": 1.7378309137489325,
"grad_norm": 0.07792218029499054,
"learning_rate": 4.501951582166061e-05,
"loss": 0.3311,
"step": 1145
},
{
"epoch": 1.7454217667710408,
"grad_norm": 0.09238504618406296,
"learning_rate": 4.458042840401019e-05,
"loss": 0.3918,
"step": 1150
},
{
"epoch": 1.7530126197931493,
"grad_norm": 0.3078314960002899,
"learning_rate": 4.414176350900909e-05,
"loss": 0.2956,
"step": 1155
},
{
"epoch": 1.7606034728152578,
"grad_norm": 0.2649385929107666,
"learning_rate": 4.370355533601249e-05,
"loss": 0.3388,
"step": 1160
},
{
"epoch": 1.768194325837366,
"grad_norm": 0.19793756306171417,
"learning_rate": 4.3265838048768334e-05,
"loss": 0.2931,
"step": 1165
},
{
"epoch": 1.7757851788594743,
"grad_norm": 0.06952016800642014,
"learning_rate": 4.282864577275403e-05,
"loss": 0.2593,
"step": 1170
},
{
"epoch": 1.7833760318815828,
"grad_norm": 0.08573190122842789,
"learning_rate": 4.2392012592515785e-05,
"loss": 0.3249,
"step": 1175
},
{
"epoch": 1.790966884903691,
"grad_norm": 0.17260582745075226,
"learning_rate": 4.195597254901147e-05,
"loss": 0.2837,
"step": 1180
},
{
"epoch": 1.7985577379257993,
"grad_norm": 0.14388003945350647,
"learning_rate": 4.152055963695652e-05,
"loss": 0.3507,
"step": 1185
},
{
"epoch": 1.8061485909479078,
"grad_norm": 0.09567199647426605,
"learning_rate": 4.1085807802173796e-05,
"loss": 0.3801,
"step": 1190
},
{
"epoch": 1.8137394439700163,
"grad_norm": 0.052392423152923584,
"learning_rate": 4.065175093894694e-05,
"loss": 0.3151,
"step": 1195
},
{
"epoch": 1.8213302969921243,
"grad_norm": 0.1070229709148407,
"learning_rate": 4.021842288737797e-05,
"loss": 0.341,
"step": 1200
},
{
"epoch": 1.8289211500142328,
"grad_norm": 0.19132941961288452,
"learning_rate": 3.978585743074909e-05,
"loss": 0.2827,
"step": 1205
},
{
"epoch": 1.8365120030363413,
"grad_norm": 0.14613649249076843,
"learning_rate": 3.9354088292888716e-05,
"loss": 0.3597,
"step": 1210
},
{
"epoch": 1.8441028560584496,
"grad_norm": 0.3268943727016449,
"learning_rate": 3.89231491355424e-05,
"loss": 0.3455,
"step": 1215
},
{
"epoch": 1.8516937090805579,
"grad_norm": 0.15382400155067444,
"learning_rate": 3.849307355574844e-05,
"loss": 0.3158,
"step": 1220
},
{
"epoch": 1.8592845621026663,
"grad_norm": 0.09326305985450745,
"learning_rate": 3.80638950832186e-05,
"loss": 0.286,
"step": 1225
},
{
"epoch": 1.8668754151247746,
"grad_norm": 0.32320550084114075,
"learning_rate": 3.763564717772397e-05,
"loss": 0.3412,
"step": 1230
},
{
"epoch": 1.8744662681468829,
"grad_norm": 0.08675755560398102,
"learning_rate": 3.720836322648652e-05,
"loss": 0.2893,
"step": 1235
},
{
"epoch": 1.8820571211689914,
"grad_norm": 0.19032827019691467,
"learning_rate": 3.6782076541576e-05,
"loss": 0.307,
"step": 1240
},
{
"epoch": 1.8896479741910999,
"grad_norm": 0.09857955574989319,
"learning_rate": 3.635682035731292e-05,
"loss": 0.2604,
"step": 1245
},
{
"epoch": 1.8972388272132081,
"grad_norm": 0.05677346885204315,
"learning_rate": 3.5932627827677536e-05,
"loss": 0.3252,
"step": 1250
},
{
"epoch": 1.9048296802353164,
"grad_norm": 0.10897451639175415,
"learning_rate": 3.550953202372503e-05,
"loss": 0.4204,
"step": 1255
},
{
"epoch": 1.912420533257425,
"grad_norm": 0.12246419489383698,
"learning_rate": 3.5087565931007316e-05,
"loss": 0.3542,
"step": 1260
},
{
"epoch": 1.9200113862795332,
"grad_norm": 0.09597641974687576,
"learning_rate": 3.466676244700127e-05,
"loss": 0.2573,
"step": 1265
},
{
"epoch": 1.9276022393016414,
"grad_norm": 0.09933824092149734,
"learning_rate": 3.4247154378544087e-05,
"loss": 0.2844,
"step": 1270
},
{
"epoch": 1.93519309232375,
"grad_norm": 0.25755199790000916,
"learning_rate": 3.382877443927549e-05,
"loss": 0.2835,
"step": 1275
},
{
"epoch": 1.9427839453458582,
"grad_norm": 0.15798412263393402,
"learning_rate": 3.34116552470874e-05,
"loss": 0.3273,
"step": 1280
},
{
"epoch": 1.9503747983679665,
"grad_norm": 0.2535303235054016,
"learning_rate": 3.299582932158085e-05,
"loss": 0.2858,
"step": 1285
},
{
"epoch": 1.957965651390075,
"grad_norm": 0.06856276839971542,
"learning_rate": 3.258132908153074e-05,
"loss": 0.3002,
"step": 1290
},
{
"epoch": 1.9655565044121834,
"grad_norm": 0.13859562575817108,
"learning_rate": 3.216818684235844e-05,
"loss": 0.2957,
"step": 1295
},
{
"epoch": 1.9731473574342917,
"grad_norm": 0.1521427035331726,
"learning_rate": 3.1756434813612266e-05,
"loss": 0.2836,
"step": 1300
},
{
"epoch": 1.9807382104564,
"grad_norm": 0.15251488983631134,
"learning_rate": 3.134610509645655e-05,
"loss": 0.3087,
"step": 1305
},
{
"epoch": 1.9883290634785085,
"grad_norm": 0.15713337063789368,
"learning_rate": 3.093722968116873e-05,
"loss": 0.2847,
"step": 1310
},
{
"epoch": 1.9959199165006167,
"grad_norm": 0.07002197206020355,
"learning_rate": 3.052984044464548e-05,
"loss": 0.281,
"step": 1315
},
{
"epoch": 2.003036341208843,
"grad_norm": 0.11551013588905334,
"learning_rate": 3.012396914791744e-05,
"loss": 0.362,
"step": 1320
},
{
"epoch": 2.0106271942309517,
"grad_norm": 0.18070758879184723,
"learning_rate": 2.971964743367309e-05,
"loss": 0.2798,
"step": 1325
},
{
"epoch": 2.01821804725306,
"grad_norm": 0.12462901324033737,
"learning_rate": 2.9316906823791753e-05,
"loss": 0.2792,
"step": 1330
},
{
"epoch": 2.0258089002751682,
"grad_norm": 0.1277770847082138,
"learning_rate": 2.8915778716886093e-05,
"loss": 0.3313,
"step": 1335
},
{
"epoch": 2.0333997532972767,
"grad_norm": 0.08295406401157379,
"learning_rate": 2.8516294385854282e-05,
"loss": 0.3326,
"step": 1340
},
{
"epoch": 2.0409906063193852,
"grad_norm": 0.22416232526302338,
"learning_rate": 2.811848497544175e-05,
"loss": 0.2702,
"step": 1345
},
{
"epoch": 2.0485814593414937,
"grad_norm": 0.23556384444236755,
"learning_rate": 2.7722381499813233e-05,
"loss": 0.3045,
"step": 1350
},
{
"epoch": 2.0561723123636018,
"grad_norm": 0.21475334465503693,
"learning_rate": 2.7328014840134658e-05,
"loss": 0.3191,
"step": 1355
},
{
"epoch": 2.0637631653857103,
"grad_norm": 0.14084239304065704,
"learning_rate": 2.693541574216575e-05,
"loss": 0.3398,
"step": 1360
},
{
"epoch": 2.0713540184078187,
"grad_norm": 0.1728208363056183,
"learning_rate": 2.6544614813862857e-05,
"loss": 0.2602,
"step": 1365
},
{
"epoch": 2.078944871429927,
"grad_norm": 0.12706124782562256,
"learning_rate": 2.61556425229928e-05,
"loss": 0.2541,
"step": 1370
},
{
"epoch": 2.0865357244520353,
"grad_norm": 0.12707297503948212,
"learning_rate": 2.5768529194757474e-05,
"loss": 0.242,
"step": 1375
},
{
"epoch": 2.0941265774741438,
"grad_norm": 0.04940659552812576,
"learning_rate": 2.538330500942963e-05,
"loss": 0.3645,
"step": 1380
},
{
"epoch": 2.101717430496252,
"grad_norm": 0.16009534895420074,
"learning_rate": 2.500000000000001e-05,
"loss": 0.2547,
"step": 1385
},
{
"epoch": 2.1093082835183603,
"grad_norm": 0.3543836176395416,
"learning_rate": 2.4618644049835782e-05,
"loss": 0.3354,
"step": 1390
},
{
"epoch": 2.116899136540469,
"grad_norm": 0.10430457442998886,
"learning_rate": 2.4239266890350904e-05,
"loss": 0.283,
"step": 1395
},
{
"epoch": 2.1244899895625773,
"grad_norm": 0.14833734929561615,
"learning_rate": 2.3861898098688057e-05,
"loss": 0.3425,
"step": 1400
},
{
"epoch": 2.1320808425846853,
"grad_norm": 0.272958904504776,
"learning_rate": 2.3486567095412864e-05,
"loss": 0.3378,
"step": 1405
},
{
"epoch": 2.139671695606794,
"grad_norm": 0.10397733747959137,
"learning_rate": 2.3113303142220094e-05,
"loss": 0.2804,
"step": 1410
},
{
"epoch": 2.1472625486289023,
"grad_norm": 0.10424613207578659,
"learning_rate": 2.2742135339652398e-05,
"loss": 0.2909,
"step": 1415
},
{
"epoch": 2.1548534016510104,
"grad_norm": 0.1362823098897934,
"learning_rate": 2.2373092624831566e-05,
"loss": 0.2652,
"step": 1420
},
{
"epoch": 2.162444254673119,
"grad_norm": 0.13256202638149261,
"learning_rate": 2.2006203769202482e-05,
"loss": 0.2872,
"step": 1425
},
{
"epoch": 2.1700351076952273,
"grad_norm": 0.30884748697280884,
"learning_rate": 2.1641497376290122e-05,
"loss": 0.3112,
"step": 1430
},
{
"epoch": 2.177625960717336,
"grad_norm": 0.1805962771177292,
"learning_rate": 2.1279001879469424e-05,
"loss": 0.2451,
"step": 1435
},
{
"epoch": 2.185216813739444,
"grad_norm": 0.11278484761714935,
"learning_rate": 2.0918745539748686e-05,
"loss": 0.2565,
"step": 1440
},
{
"epoch": 2.1928076667615524,
"grad_norm": 0.15021051466464996,
"learning_rate": 2.0560756443566148e-05,
"loss": 0.2576,
"step": 1445
},
{
"epoch": 2.200398519783661,
"grad_norm": 0.12693342566490173,
"learning_rate": 2.0205062500600446e-05,
"loss": 0.2776,
"step": 1450
},
{
"epoch": 2.207989372805769,
"grad_norm": 0.08629851788282394,
"learning_rate": 1.985169144159456e-05,
"loss": 0.3334,
"step": 1455
},
{
"epoch": 2.2155802258278774,
"grad_norm": 0.10110223293304443,
"learning_rate": 1.9500670816193968e-05,
"loss": 0.2517,
"step": 1460
},
{
"epoch": 2.223171078849986,
"grad_norm": 0.27294450998306274,
"learning_rate": 1.9152027990798748e-05,
"loss": 0.3316,
"step": 1465
},
{
"epoch": 2.230761931872094,
"grad_norm": 0.22429701685905457,
"learning_rate": 1.8805790146430063e-05,
"loss": 0.2648,
"step": 1470
},
{
"epoch": 2.2383527848942024,
"grad_norm": 0.13777467608451843,
"learning_rate": 1.8461984276611084e-05,
"loss": 0.346,
"step": 1475
},
{
"epoch": 2.245943637916311,
"grad_norm": 0.30676957964897156,
"learning_rate": 1.8120637185262418e-05,
"loss": 0.4114,
"step": 1480
},
{
"epoch": 2.2535344909384194,
"grad_norm": 0.15495023131370544,
"learning_rate": 1.778177548461255e-05,
"loss": 0.3818,
"step": 1485
},
{
"epoch": 2.2611253439605274,
"grad_norm": 0.13254746794700623,
"learning_rate": 1.744542559312295e-05,
"loss": 0.3267,
"step": 1490
},
{
"epoch": 2.268716196982636,
"grad_norm": 0.1586749106645584,
"learning_rate": 1.7111613733428522e-05,
"loss": 0.2979,
"step": 1495
},
{
"epoch": 2.2763070500047444,
"grad_norm": 0.14043815433979034,
"learning_rate": 1.6780365930293163e-05,
"loss": 0.2736,
"step": 1500
},
{
"epoch": 2.2763070500047444,
"eval_loss": 0.4912321865558624,
"eval_runtime": 247.8053,
"eval_samples_per_second": 9.455,
"eval_steps_per_second": 9.455,
"step": 1500
},
{
"epoch": 2.2838979030268525,
"grad_norm": 0.08288119733333588,
"learning_rate": 1.6451708008580907e-05,
"loss": 0.2354,
"step": 1505
},
{
"epoch": 2.291488756048961,
"grad_norm": 0.23708699643611908,
"learning_rate": 1.6125665591242433e-05,
"loss": 0.3625,
"step": 1510
},
{
"epoch": 2.2990796090710695,
"grad_norm": 0.030745351687073708,
"learning_rate": 1.58022640973175e-05,
"loss": 0.266,
"step": 1515
},
{
"epoch": 2.3066704620931775,
"grad_norm": 0.11616528779268265,
"learning_rate": 1.5481528739953272e-05,
"loss": 0.3145,
"step": 1520
},
{
"epoch": 2.314261315115286,
"grad_norm": 0.25872117280960083,
"learning_rate": 1.5163484524438516e-05,
"loss": 0.2714,
"step": 1525
},
{
"epoch": 2.3218521681373945,
"grad_norm": 0.10753320902585983,
"learning_rate": 1.4848156246254263e-05,
"loss": 0.2801,
"step": 1530
},
{
"epoch": 2.329443021159503,
"grad_norm": 0.18508432805538177,
"learning_rate": 1.4535568489140594e-05,
"loss": 0.2678,
"step": 1535
},
{
"epoch": 2.337033874181611,
"grad_norm": 0.18051201105117798,
"learning_rate": 1.422574562318007e-05,
"loss": 0.2981,
"step": 1540
},
{
"epoch": 2.3446247272037195,
"grad_norm": 0.12133996188640594,
"learning_rate": 1.3918711802897789e-05,
"loss": 0.286,
"step": 1545
},
{
"epoch": 2.352215580225828,
"grad_norm": 0.13994358479976654,
"learning_rate": 1.3614490965378257e-05,
"loss": 0.274,
"step": 1550
},
{
"epoch": 2.359806433247936,
"grad_norm": 0.1304178684949875,
"learning_rate": 1.3313106828399147e-05,
"loss": 0.3661,
"step": 1555
},
{
"epoch": 2.3673972862700445,
"grad_norm": 0.30562353134155273,
"learning_rate": 1.3014582888582232e-05,
"loss": 0.2732,
"step": 1560
},
{
"epoch": 2.374988139292153,
"grad_norm": 0.03359885513782501,
"learning_rate": 1.271894241956158e-05,
"loss": 0.286,
"step": 1565
},
{
"epoch": 2.3825789923142615,
"grad_norm": 0.33226141333580017,
"learning_rate": 1.2426208470168965e-05,
"loss": 0.2796,
"step": 1570
},
{
"epoch": 2.3901698453363696,
"grad_norm": 0.07864666730165482,
"learning_rate": 1.213640386263708e-05,
"loss": 0.2444,
"step": 1575
},
{
"epoch": 2.397760698358478,
"grad_norm": 0.12879331409931183,
"learning_rate": 1.1849551190820125e-05,
"loss": 0.3099,
"step": 1580
},
{
"epoch": 2.4053515513805865,
"grad_norm": 0.12465628236532211,
"learning_rate": 1.156567281843241e-05,
"loss": 0.2591,
"step": 1585
},
{
"epoch": 2.4129424044026946,
"grad_norm": 0.22187581658363342,
"learning_rate": 1.1284790877304807e-05,
"loss": 0.3553,
"step": 1590
},
{
"epoch": 2.420533257424803,
"grad_norm": 0.16689549386501312,
"learning_rate": 1.1006927265659334e-05,
"loss": 0.2627,
"step": 1595
},
{
"epoch": 2.4281241104469116,
"grad_norm": 0.12732334434986115,
"learning_rate": 1.0732103646401847e-05,
"loss": 0.2605,
"step": 1600
},
{
"epoch": 2.43571496346902,
"grad_norm": 0.051952481269836426,
"learning_rate": 1.0460341445433191e-05,
"loss": 0.339,
"step": 1605
},
{
"epoch": 2.443305816491128,
"grad_norm": 0.13088718056678772,
"learning_rate": 1.0191661849978824e-05,
"loss": 0.3354,
"step": 1610
},
{
"epoch": 2.4508966695132366,
"grad_norm": 0.12184888124465942,
"learning_rate": 9.926085806936918e-06,
"loss": 0.3414,
"step": 1615
},
{
"epoch": 2.458487522535345,
"grad_norm": 0.10065177083015442,
"learning_rate": 9.663634021245399e-06,
"loss": 0.2189,
"step": 1620
},
{
"epoch": 2.466078375557453,
"grad_norm": 0.09585690498352051,
"learning_rate": 9.404326954267634e-06,
"loss": 0.1881,
"step": 1625
},
{
"epoch": 2.4736692285795616,
"grad_norm": 0.23267728090286255,
"learning_rate": 9.148184822197282e-06,
"loss": 0.3027,
"step": 1630
},
{
"epoch": 2.48126008160167,
"grad_norm": 0.15521545708179474,
"learning_rate": 8.895227594482164e-06,
"loss": 0.2562,
"step": 1635
},
{
"epoch": 2.488850934623778,
"grad_norm": 0.12793651223182678,
"learning_rate": 8.645474992267438e-06,
"loss": 0.244,
"step": 1640
},
{
"epoch": 2.4964417876458866,
"grad_norm": 0.2494293749332428,
"learning_rate": 8.398946486858029e-06,
"loss": 0.3017,
"step": 1645
},
{
"epoch": 2.504032640667995,
"grad_norm": 0.15070238709449768,
"learning_rate": 8.155661298200635e-06,
"loss": 0.2686,
"step": 1650
},
{
"epoch": 2.511623493690103,
"grad_norm": 0.11418969929218292,
"learning_rate": 7.915638393385316e-06,
"loss": 0.3993,
"step": 1655
},
{
"epoch": 2.5192143467122117,
"grad_norm": 0.23196111619472504,
"learning_rate": 7.67889648516672e-06,
"loss": 0.316,
"step": 1660
},
{
"epoch": 2.52680519973432,
"grad_norm": 0.13710691034793854,
"learning_rate": 7.445454030505256e-06,
"loss": 0.3051,
"step": 1665
},
{
"epoch": 2.5343960527564287,
"grad_norm": 0.12727871537208557,
"learning_rate": 7.215329229128076e-06,
"loss": 0.2903,
"step": 1670
},
{
"epoch": 2.5419869057785367,
"grad_norm": 0.11962519586086273,
"learning_rate": 6.988540022110235e-06,
"loss": 0.3204,
"step": 1675
},
{
"epoch": 2.549577758800645,
"grad_norm": 0.14515872299671173,
"learning_rate": 6.765104090475932e-06,
"loss": 0.3121,
"step": 1680
},
{
"epoch": 2.5571686118227537,
"grad_norm": 0.2983834445476532,
"learning_rate": 6.545038853820096e-06,
"loss": 0.3353,
"step": 1685
},
{
"epoch": 2.5647594648448617,
"grad_norm": 0.1801896095275879,
"learning_rate": 6.328361468950267e-06,
"loss": 0.4262,
"step": 1690
},
{
"epoch": 2.57235031786697,
"grad_norm": 0.07517129927873611,
"learning_rate": 6.115088828549003e-06,
"loss": 0.2728,
"step": 1695
},
{
"epoch": 2.5799411708890787,
"grad_norm": 0.19445601105690002,
"learning_rate": 5.905237559856974e-06,
"loss": 0.3221,
"step": 1700
},
{
"epoch": 2.587532023911187,
"grad_norm": 0.10309174656867981,
"learning_rate": 5.698824023376531e-06,
"loss": 0.2641,
"step": 1705
},
{
"epoch": 2.5951228769332952,
"grad_norm": 0.14000511169433594,
"learning_rate": 5.495864311596343e-06,
"loss": 0.2636,
"step": 1710
},
{
"epoch": 2.6027137299554037,
"grad_norm": 0.11765392124652863,
"learning_rate": 5.296374247736635e-06,
"loss": 0.2529,
"step": 1715
},
{
"epoch": 2.6103045829775122,
"grad_norm": 0.1269364356994629,
"learning_rate": 5.100369384515735e-06,
"loss": 0.2678,
"step": 1720
},
{
"epoch": 2.6178954359996203,
"grad_norm": 0.06566209346055984,
"learning_rate": 4.907865002937406e-06,
"loss": 0.26,
"step": 1725
},
{
"epoch": 2.6254862890217288,
"grad_norm": 0.07295841723680496,
"learning_rate": 4.718876111099613e-06,
"loss": 0.2591,
"step": 1730
},
{
"epoch": 2.6330771420438372,
"grad_norm": 0.17099271714687347,
"learning_rate": 4.533417443024374e-06,
"loss": 0.24,
"step": 1735
},
{
"epoch": 2.6406679950659457,
"grad_norm": 0.16684836149215698,
"learning_rate": 4.351503457509093e-06,
"loss": 0.2542,
"step": 1740
},
{
"epoch": 2.648258848088054,
"grad_norm": 0.1620602011680603,
"learning_rate": 4.17314833699935e-06,
"loss": 0.2782,
"step": 1745
},
{
"epoch": 2.6558497011101623,
"grad_norm": 0.1975618451833725,
"learning_rate": 3.998365986483143e-06,
"loss": 0.2835,
"step": 1750
},
{
"epoch": 2.6634405541322708,
"grad_norm": 0.04293884336948395,
"learning_rate": 3.827170032406851e-06,
"loss": 0.2825,
"step": 1755
},
{
"epoch": 2.671031407154379,
"grad_norm": 0.09690513461828232,
"learning_rate": 3.6595738216128994e-06,
"loss": 0.3289,
"step": 1760
},
{
"epoch": 2.6786222601764873,
"grad_norm": 0.029426729306578636,
"learning_rate": 3.495590420299194e-06,
"loss": 0.2791,
"step": 1765
},
{
"epoch": 2.686213113198596,
"grad_norm": 0.11573446542024612,
"learning_rate": 3.335232613000433e-06,
"loss": 0.3137,
"step": 1770
},
{
"epoch": 2.6938039662207043,
"grad_norm": 0.26567184925079346,
"learning_rate": 3.1785129015914296e-06,
"loss": 0.3154,
"step": 1775
},
{
"epoch": 2.7013948192428123,
"grad_norm": 0.13201837241649628,
"learning_rate": 3.0254435043124083e-06,
"loss": 0.3069,
"step": 1780
},
{
"epoch": 2.708985672264921,
"grad_norm": 0.29772883653640747,
"learning_rate": 2.876036354816436e-06,
"loss": 0.2662,
"step": 1785
},
{
"epoch": 2.716576525287029,
"grad_norm": 0.08354216068983078,
"learning_rate": 2.73030310123909e-06,
"loss": 0.3624,
"step": 1790
},
{
"epoch": 2.7241673783091374,
"grad_norm": 0.259625107049942,
"learning_rate": 2.5882551052902883e-06,
"loss": 0.2486,
"step": 1795
},
{
"epoch": 2.731758231331246,
"grad_norm": 0.22710494697093964,
"learning_rate": 2.4499034413685395e-06,
"loss": 0.3115,
"step": 1800
},
{
"epoch": 2.7393490843533543,
"grad_norm": 0.07519204914569855,
"learning_rate": 2.3152588956975365e-06,
"loss": 0.2924,
"step": 1805
},
{
"epoch": 2.746939937375463,
"grad_norm": 0.18423676490783691,
"learning_rate": 2.184331965485259e-06,
"loss": 0.3002,
"step": 1810
},
{
"epoch": 2.754530790397571,
"grad_norm": 0.1939702183008194,
"learning_rate": 2.057132858105548e-06,
"loss": 0.3338,
"step": 1815
},
{
"epoch": 2.7621216434196794,
"grad_norm": 0.20939414203166962,
"learning_rate": 1.93367149030238e-06,
"loss": 0.2796,
"step": 1820
},
{
"epoch": 2.7697124964417874,
"grad_norm": 0.26544660329818726,
"learning_rate": 1.813957487416651e-06,
"loss": 0.3037,
"step": 1825
},
{
"epoch": 2.777303349463896,
"grad_norm": 0.20273450016975403,
"learning_rate": 1.6980001826358226e-06,
"loss": 0.2634,
"step": 1830
},
{
"epoch": 2.7848942024860044,
"grad_norm": 0.28037548065185547,
"learning_rate": 1.585808616266271e-06,
"loss": 0.4404,
"step": 1835
},
{
"epoch": 2.792485055508113,
"grad_norm": 0.18899406492710114,
"learning_rate": 1.4773915350284772e-06,
"loss": 0.3411,
"step": 1840
},
{
"epoch": 2.800075908530221,
"grad_norm": 0.3918057382106781,
"learning_rate": 1.3727573913751013e-06,
"loss": 0.2854,
"step": 1845
},
{
"epoch": 2.8076667615523294,
"grad_norm": 0.1583373099565506,
"learning_rate": 1.2719143428320256e-06,
"loss": 0.2327,
"step": 1850
},
{
"epoch": 2.815257614574438,
"grad_norm": 0.1468050479888916,
"learning_rate": 1.1748702513623922e-06,
"loss": 0.2568,
"step": 1855
},
{
"epoch": 2.822848467596546,
"grad_norm": 0.14241372048854828,
"learning_rate": 1.0816326827536083e-06,
"loss": 0.2705,
"step": 1860
},
{
"epoch": 2.8304393206186544,
"grad_norm": 0.1886911541223526,
"learning_rate": 9.92208906027564e-07,
"loss": 0.2619,
"step": 1865
},
{
"epoch": 2.838030173640763,
"grad_norm": 0.16523589193820953,
"learning_rate": 9.066058928738797e-07,
"loss": 0.2628,
"step": 1870
},
{
"epoch": 2.8456210266628714,
"grad_norm": 0.10537305474281311,
"learning_rate": 8.248303171063898e-07,
"loss": 0.2101,
"step": 1875
},
{
"epoch": 2.8532118796849795,
"grad_norm": 0.08789675682783127,
"learning_rate": 7.468885541428438e-07,
"loss": 0.2569,
"step": 1880
},
{
"epoch": 2.860802732707088,
"grad_norm": 0.13445930182933807,
"learning_rate": 6.727866805078531e-07,
"loss": 0.2946,
"step": 1885
},
{
"epoch": 2.8683935857291964,
"grad_norm": 0.1602274775505066,
"learning_rate": 6.02530473359153e-07,
"loss": 0.2775,
"step": 1890
},
{
"epoch": 2.8759844387513045,
"grad_norm": 0.12329945713281631,
"learning_rate": 5.361254100371915e-07,
"loss": 0.2592,
"step": 1895
},
{
"epoch": 2.883575291773413,
"grad_norm": 0.07469865679740906,
"learning_rate": 4.7357666763814813e-07,
"loss": 0.3769,
"step": 1900
},
{
"epoch": 2.8911661447955215,
"grad_norm": 0.11232876777648926,
"learning_rate": 4.148891226102347e-07,
"loss": 0.2907,
"step": 1905
},
{
"epoch": 2.89875699781763,
"grad_norm": 0.20298312604427338,
"learning_rate": 3.600673503736107e-07,
"loss": 0.2756,
"step": 1910
},
{
"epoch": 2.906347850839738,
"grad_norm": 0.24480293691158295,
"learning_rate": 3.0911562496358517e-07,
"loss": 0.2864,
"step": 1915
},
{
"epoch": 2.9139387038618465,
"grad_norm": 0.21817973256111145,
"learning_rate": 2.620379186974664e-07,
"loss": 0.2548,
"step": 1920
},
{
"epoch": 2.921529556883955,
"grad_norm": 0.09590750187635422,
"learning_rate": 2.1883790186483234e-07,
"loss": 0.296,
"step": 1925
},
{
"epoch": 2.929120409906063,
"grad_norm": 0.07791176438331604,
"learning_rate": 1.79518942441409e-07,
"loss": 0.2834,
"step": 1930
},
{
"epoch": 2.9367112629281715,
"grad_norm": 0.1533590704202652,
"learning_rate": 1.440841058264808e-07,
"loss": 0.3561,
"step": 1935
},
{
"epoch": 2.94430211595028,
"grad_norm": 0.046548765152692795,
"learning_rate": 1.1253615460391498e-07,
"loss": 0.227,
"step": 1940
},
{
"epoch": 2.9518929689723885,
"grad_norm": 0.30706334114074707,
"learning_rate": 8.48775483267783e-08,
"loss": 0.2787,
"step": 1945
},
{
"epoch": 2.9594838219944966,
"grad_norm": 0.24995796382427216,
"learning_rate": 6.111044332557936e-08,
"loss": 0.2536,
"step": 1950
},
{
"epoch": 2.967074675016605,
"grad_norm": 0.1498573124408722,
"learning_rate": 4.123669254017526e-08,
"loss": 0.3097,
"step": 1955
},
{
"epoch": 2.974665528038713,
"grad_norm": 0.1821010708808899,
"learning_rate": 2.525784537528164e-08,
"loss": 0.2778,
"step": 1960
},
{
"epoch": 2.9822563810608216,
"grad_norm": 0.07815810292959213,
"learning_rate": 1.3175147579702619e-08,
"loss": 0.2583,
"step": 1965
},
{
"epoch": 2.98984723408293,
"grad_norm": 0.16738541424274445,
"learning_rate": 4.9895411492084656e-09,
"loss": 0.2417,
"step": 1970
},
{
"epoch": 2.9974380871050386,
"grad_norm": 0.0907551497220993,
"learning_rate": 7.016642530777162e-10,
"loss": 0.308,
"step": 1975
}
],
"logging_steps": 5,
"max_steps": 1977,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.852198258216796e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}