Dolphin3.0-Llama3.1-8B/trainer_state.json

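The JSON below follows the standard Hugging Face Trainer trainer_state.json layout: top-level run metadata ("epoch", "global_step", "eval_steps", ...) plus a "log_history" array of per-step training records ("loss", "grad_norm", "learning_rate", "step") interleaved with periodic evaluation records ("eval_loss", "eval_runtime", ...). A minimal Python sketch for loading and summarizing such a file is shown here; the path and field names are taken from this dump, and everything else is an illustrative assumption rather than part of the original file.

import json

# Load the Trainer state shown below (path taken from the header above;
# assumes the full file is available on disk).
with open("Dolphin3.0-Llama3.1-8B/trainer_state.json") as f:
    state = json.load(f)

# Training-step records carry "loss"; evaluation records carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"epochs: {state['epoch']:.2f}  global steps: {state['global_step']}")
print(f"train loss: {train_logs[0]['loss']} (step {train_logs[0]['step']}) "
      f"-> {train_logs[-1]['loss']} (step {train_logs[-1]['step']})")
if eval_logs:
    print(f"eval loss at step {eval_logs[0]['step']}: {eval_logs[0]['eval_loss']}")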

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.99864046471388,
"eval_steps": 506,
"global_step": 6066,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004943764676801385,
"grad_norm": 50.16305697781131,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.0593,
"step": 1
},
{
"epoch": 0.0004943764676801385,
"eval_loss": 1.0439037084579468,
"eval_runtime": 100.8565,
"eval_samples_per_second": 300.962,
"eval_steps_per_second": 37.628,
"step": 1
},
{
"epoch": 0.000988752935360277,
"grad_norm": 51.25853385500834,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.0409,
"step": 2
},
{
"epoch": 0.0014831294030404152,
"grad_norm": 40.36823881797247,
"learning_rate": 3e-06,
"loss": 1.0141,
"step": 3
},
{
"epoch": 0.001977505870720554,
"grad_norm": 17.0999595201057,
"learning_rate": 4.000000000000001e-06,
"loss": 0.9595,
"step": 4
},
{
"epoch": 0.002471882338400692,
"grad_norm": 2.5828220351575104,
"learning_rate": 5e-06,
"loss": 0.8015,
"step": 5
},
{
"epoch": 0.0029662588060808304,
"grad_norm": 1.052839044442619,
"learning_rate": 6e-06,
"loss": 0.7433,
"step": 6
},
{
"epoch": 0.003460635273760969,
"grad_norm": 0.6543936962130193,
"learning_rate": 7e-06,
"loss": 0.7402,
"step": 7
},
{
"epoch": 0.003955011741441108,
"grad_norm": 0.3160867481278867,
"learning_rate": 8.000000000000001e-06,
"loss": 0.7089,
"step": 8
},
{
"epoch": 0.004449388209121246,
"grad_norm": 0.2625429913030516,
"learning_rate": 9e-06,
"loss": 0.7146,
"step": 9
},
{
"epoch": 0.004943764676801384,
"grad_norm": 0.30195426819829757,
"learning_rate": 1e-05,
"loss": 0.6943,
"step": 10
},
{
"epoch": 0.005438141144481523,
"grad_norm": 0.3025640856597048,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.6789,
"step": 11
},
{
"epoch": 0.005932517612161661,
"grad_norm": 0.304449919990198,
"learning_rate": 1.2e-05,
"loss": 0.7105,
"step": 12
},
{
"epoch": 0.0064268940798418,
"grad_norm": 0.25670547313250547,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.6653,
"step": 13
},
{
"epoch": 0.006921270547521938,
"grad_norm": 0.319722662176669,
"learning_rate": 1.4e-05,
"loss": 0.7119,
"step": 14
},
{
"epoch": 0.007415647015202077,
"grad_norm": 0.3341022024270933,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.6554,
"step": 15
},
{
"epoch": 0.007910023482882216,
"grad_norm": 0.2417066940171013,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.6636,
"step": 16
},
{
"epoch": 0.008404399950562353,
"grad_norm": 0.25568160995672884,
"learning_rate": 1.7e-05,
"loss": 0.6755,
"step": 17
},
{
"epoch": 0.008898776418242492,
"grad_norm": 0.22874905698234116,
"learning_rate": 1.8e-05,
"loss": 0.651,
"step": 18
},
{
"epoch": 0.00939315288592263,
"grad_norm": 0.2328079369429896,
"learning_rate": 1.9e-05,
"loss": 0.6639,
"step": 19
},
{
"epoch": 0.009887529353602768,
"grad_norm": 0.19989889476002418,
"learning_rate": 2e-05,
"loss": 0.6445,
"step": 20
},
{
"epoch": 0.010381905821282907,
"grad_norm": 0.1933533798206734,
"learning_rate": 1.999999924187998e-05,
"loss": 0.6388,
"step": 21
},
{
"epoch": 0.010876282288963045,
"grad_norm": 0.20678455394628042,
"learning_rate": 1.9999996967520037e-05,
"loss": 0.6234,
"step": 22
},
{
"epoch": 0.011370658756643184,
"grad_norm": 0.19793648580115647,
"learning_rate": 1.9999993176920513e-05,
"loss": 0.6602,
"step": 23
},
{
"epoch": 0.011865035224323322,
"grad_norm": 0.1686698024537807,
"learning_rate": 1.9999987870081987e-05,
"loss": 0.625,
"step": 24
},
{
"epoch": 0.01235941169200346,
"grad_norm": 0.17091247442472965,
"learning_rate": 1.999998104700526e-05,
"loss": 0.6601,
"step": 25
},
{
"epoch": 0.0128537881596836,
"grad_norm": 0.1785301399633824,
"learning_rate": 1.9999972707691367e-05,
"loss": 0.6512,
"step": 26
},
{
"epoch": 0.013348164627363738,
"grad_norm": 0.1554938906730645,
"learning_rate": 1.9999962852141573e-05,
"loss": 0.6321,
"step": 27
},
{
"epoch": 0.013842541095043875,
"grad_norm": 0.15664333466237776,
"learning_rate": 1.9999951480357373e-05,
"loss": 0.6441,
"step": 28
},
{
"epoch": 0.014336917562724014,
"grad_norm": 0.1640847231049839,
"learning_rate": 1.999993859234049e-05,
"loss": 0.6265,
"step": 29
},
{
"epoch": 0.014831294030404153,
"grad_norm": 0.1748523028966459,
"learning_rate": 1.9999924188092875e-05,
"loss": 0.6196,
"step": 30
},
{
"epoch": 0.01532567049808429,
"grad_norm": 0.14588244842133785,
"learning_rate": 1.9999908267616722e-05,
"loss": 0.5941,
"step": 31
},
{
"epoch": 0.01582004696576443,
"grad_norm": 0.1554158109898706,
"learning_rate": 1.9999890830914436e-05,
"loss": 0.6004,
"step": 32
},
{
"epoch": 0.016314423433444566,
"grad_norm": 0.16166022340530584,
"learning_rate": 1.9999871877988663e-05,
"loss": 0.6176,
"step": 33
},
{
"epoch": 0.016808799901124705,
"grad_norm": 0.15760053133696716,
"learning_rate": 1.9999851408842276e-05,
"loss": 0.6045,
"step": 34
},
{
"epoch": 0.017303176368804844,
"grad_norm": 0.160652292478991,
"learning_rate": 1.999982942347838e-05,
"loss": 0.6252,
"step": 35
},
{
"epoch": 0.017797552836484983,
"grad_norm": 0.14805324914487572,
"learning_rate": 1.999980592190031e-05,
"loss": 0.6125,
"step": 36
},
{
"epoch": 0.018291929304165122,
"grad_norm": 0.17827435539775832,
"learning_rate": 1.9999780904111628e-05,
"loss": 0.5907,
"step": 37
},
{
"epoch": 0.01878630577184526,
"grad_norm": 0.17402087551850343,
"learning_rate": 1.9999754370116124e-05,
"loss": 0.6118,
"step": 38
},
{
"epoch": 0.0192806822395254,
"grad_norm": 0.15709097790495138,
"learning_rate": 1.9999726319917828e-05,
"loss": 0.5845,
"step": 39
},
{
"epoch": 0.019775058707205535,
"grad_norm": 0.14905512966757198,
"learning_rate": 1.9999696753520988e-05,
"loss": 0.6043,
"step": 40
},
{
"epoch": 0.020269435174885674,
"grad_norm": 0.15273613038485645,
"learning_rate": 1.999966567093009e-05,
"loss": 0.6186,
"step": 41
},
{
"epoch": 0.020763811642565813,
"grad_norm": 0.15763535490287664,
"learning_rate": 1.999963307214984e-05,
"loss": 0.6046,
"step": 42
},
{
"epoch": 0.021258188110245952,
"grad_norm": 0.14105111689756658,
"learning_rate": 1.999959895718519e-05,
"loss": 0.5981,
"step": 43
},
{
"epoch": 0.02175256457792609,
"grad_norm": 0.1651678027726558,
"learning_rate": 1.9999563326041307e-05,
"loss": 0.5998,
"step": 44
},
{
"epoch": 0.02224694104560623,
"grad_norm": 0.15300291404088398,
"learning_rate": 1.9999526178723598e-05,
"loss": 0.6154,
"step": 45
},
{
"epoch": 0.02274131751328637,
"grad_norm": 0.1630037485845006,
"learning_rate": 1.999948751523769e-05,
"loss": 0.6628,
"step": 46
},
{
"epoch": 0.023235693980966508,
"grad_norm": 0.16123630285859641,
"learning_rate": 1.9999447335589445e-05,
"loss": 0.6142,
"step": 47
},
{
"epoch": 0.023730070448646643,
"grad_norm": 0.1650864704929084,
"learning_rate": 1.999940563978496e-05,
"loss": 0.5885,
"step": 48
},
{
"epoch": 0.024224446916326782,
"grad_norm": 0.14229716415132507,
"learning_rate": 1.999936242783056e-05,
"loss": 0.6144,
"step": 49
},
{
"epoch": 0.02471882338400692,
"grad_norm": 0.18396170270711035,
"learning_rate": 1.9999317699732786e-05,
"loss": 0.6188,
"step": 50
},
{
"epoch": 0.02521319985168706,
"grad_norm": 0.15094586517656675,
"learning_rate": 1.999927145549843e-05,
"loss": 0.6131,
"step": 51
},
{
"epoch": 0.0257075763193672,
"grad_norm": 0.14937651087498216,
"learning_rate": 1.9999223695134494e-05,
"loss": 0.5985,
"step": 52
},
{
"epoch": 0.026201952787047338,
"grad_norm": 0.15753625812018066,
"learning_rate": 1.9999174418648232e-05,
"loss": 0.6095,
"step": 53
},
{
"epoch": 0.026696329254727477,
"grad_norm": 0.16631632760551562,
"learning_rate": 1.9999123626047106e-05,
"loss": 0.6058,
"step": 54
},
{
"epoch": 0.027190705722407612,
"grad_norm": 0.1536244785739368,
"learning_rate": 1.999907131733882e-05,
"loss": 0.594,
"step": 55
},
{
"epoch": 0.02768508219008775,
"grad_norm": 0.15371804576780498,
"learning_rate": 1.9999017492531305e-05,
"loss": 0.5867,
"step": 56
},
{
"epoch": 0.02817945865776789,
"grad_norm": 0.19064505776381935,
"learning_rate": 1.9998962151632723e-05,
"loss": 0.6026,
"step": 57
},
{
"epoch": 0.02867383512544803,
"grad_norm": 0.16138644607825361,
"learning_rate": 1.9998905294651462e-05,
"loss": 0.6334,
"step": 58
},
{
"epoch": 0.029168211593128168,
"grad_norm": 0.16755844636120792,
"learning_rate": 1.9998846921596148e-05,
"loss": 0.5857,
"step": 59
},
{
"epoch": 0.029662588060808306,
"grad_norm": 0.14932372004919575,
"learning_rate": 1.999878703247563e-05,
"loss": 0.5976,
"step": 60
},
{
"epoch": 0.030156964528488445,
"grad_norm": 0.16592050596173333,
"learning_rate": 1.9998725627298988e-05,
"loss": 0.6278,
"step": 61
},
{
"epoch": 0.03065134099616858,
"grad_norm": 0.16879045069556234,
"learning_rate": 1.999866270607553e-05,
"loss": 0.5534,
"step": 62
},
{
"epoch": 0.03114571746384872,
"grad_norm": 0.1522597592048158,
"learning_rate": 1.9998598268814803e-05,
"loss": 0.5914,
"step": 63
},
{
"epoch": 0.03164009393152886,
"grad_norm": 0.17000391503525156,
"learning_rate": 1.9998532315526565e-05,
"loss": 0.5903,
"step": 64
},
{
"epoch": 0.032134470399209,
"grad_norm": 0.15415604055811336,
"learning_rate": 1.9998464846220832e-05,
"loss": 0.5818,
"step": 65
},
{
"epoch": 0.03262884686688913,
"grad_norm": 0.16559059791605857,
"learning_rate": 1.9998395860907822e-05,
"loss": 0.6114,
"step": 66
},
{
"epoch": 0.033123223334569275,
"grad_norm": 0.1458045426558071,
"learning_rate": 1.9998325359597998e-05,
"loss": 0.5703,
"step": 67
},
{
"epoch": 0.03361759980224941,
"grad_norm": 0.1616342874885217,
"learning_rate": 1.9998253342302053e-05,
"loss": 0.5818,
"step": 68
},
{
"epoch": 0.03411197626992955,
"grad_norm": 0.15948136880993324,
"learning_rate": 1.9998179809030906e-05,
"loss": 0.5999,
"step": 69
},
{
"epoch": 0.03460635273760969,
"grad_norm": 0.18302716331226213,
"learning_rate": 1.99981047597957e-05,
"loss": 0.5925,
"step": 70
},
{
"epoch": 0.03510072920528983,
"grad_norm": 0.18478485438416417,
"learning_rate": 1.999802819460782e-05,
"loss": 0.6106,
"step": 71
},
{
"epoch": 0.035595105672969966,
"grad_norm": 0.15288453049247022,
"learning_rate": 1.9997950113478875e-05,
"loss": 0.5788,
"step": 72
},
{
"epoch": 0.0360894821406501,
"grad_norm": 0.15533434045572492,
"learning_rate": 1.9997870516420702e-05,
"loss": 0.6076,
"step": 73
},
{
"epoch": 0.036583858608330244,
"grad_norm": 0.1459642570283682,
"learning_rate": 1.999778940344537e-05,
"loss": 0.5885,
"step": 74
},
{
"epoch": 0.03707823507601038,
"grad_norm": 0.16702227516936843,
"learning_rate": 1.999770677456518e-05,
"loss": 0.597,
"step": 75
},
{
"epoch": 0.03757261154369052,
"grad_norm": 0.14734045765133108,
"learning_rate": 1.9997622629792656e-05,
"loss": 0.565,
"step": 76
},
{
"epoch": 0.03806698801137066,
"grad_norm": 0.18704026594211895,
"learning_rate": 1.9997536969140564e-05,
"loss": 0.5589,
"step": 77
},
{
"epoch": 0.0385613644790508,
"grad_norm": 0.14158046157446746,
"learning_rate": 1.9997449792621885e-05,
"loss": 0.5647,
"step": 78
},
{
"epoch": 0.039055740946730935,
"grad_norm": 0.15976974108602848,
"learning_rate": 1.999736110024984e-05,
"loss": 0.5784,
"step": 79
},
{
"epoch": 0.03955011741441107,
"grad_norm": 0.14599225270211502,
"learning_rate": 1.999727089203787e-05,
"loss": 0.5963,
"step": 80
},
{
"epoch": 0.04004449388209121,
"grad_norm": 0.1671645476154032,
"learning_rate": 1.9997179167999666e-05,
"loss": 0.6109,
"step": 81
},
{
"epoch": 0.04053887034977135,
"grad_norm": 0.1635894336601305,
"learning_rate": 1.999708592814913e-05,
"loss": 0.5931,
"step": 82
},
{
"epoch": 0.04103324681745149,
"grad_norm": 0.3039539296262935,
"learning_rate": 1.999699117250039e-05,
"loss": 0.5835,
"step": 83
},
{
"epoch": 0.041527623285131626,
"grad_norm": 0.15175577068503943,
"learning_rate": 1.999689490106783e-05,
"loss": 0.6088,
"step": 84
},
{
"epoch": 0.04202199975281177,
"grad_norm": 0.1851856853501561,
"learning_rate": 1.9996797113866036e-05,
"loss": 0.5736,
"step": 85
},
{
"epoch": 0.042516376220491904,
"grad_norm": 0.2013397293993224,
"learning_rate": 1.9996697810909834e-05,
"loss": 0.6039,
"step": 86
},
{
"epoch": 0.043010752688172046,
"grad_norm": 0.14332078969971054,
"learning_rate": 1.999659699221429e-05,
"loss": 0.5978,
"step": 87
},
{
"epoch": 0.04350512915585218,
"grad_norm": 0.17661895009090986,
"learning_rate": 1.9996494657794678e-05,
"loss": 0.6069,
"step": 88
},
{
"epoch": 0.04399950562353232,
"grad_norm": 0.18494634501877075,
"learning_rate": 1.9996390807666525e-05,
"loss": 0.6006,
"step": 89
},
{
"epoch": 0.04449388209121246,
"grad_norm": 0.13964608891886043,
"learning_rate": 1.9996285441845568e-05,
"loss": 0.592,
"step": 90
},
{
"epoch": 0.044988258558892595,
"grad_norm": 0.1764890796335076,
"learning_rate": 1.9996178560347795e-05,
"loss": 0.6129,
"step": 91
},
{
"epoch": 0.04548263502657274,
"grad_norm": 0.16915623555795675,
"learning_rate": 1.99960701631894e-05,
"loss": 0.5958,
"step": 92
},
{
"epoch": 0.04597701149425287,
"grad_norm": 0.14534126699082817,
"learning_rate": 1.9995960250386822e-05,
"loss": 0.5878,
"step": 93
},
{
"epoch": 0.046471387961933015,
"grad_norm": 0.15655515207026707,
"learning_rate": 1.999584882195673e-05,
"loss": 0.5841,
"step": 94
},
{
"epoch": 0.04696576442961315,
"grad_norm": 0.2920845719904276,
"learning_rate": 1.999573587791602e-05,
"loss": 0.5466,
"step": 95
},
{
"epoch": 0.047460140897293286,
"grad_norm": 0.15101728221997562,
"learning_rate": 1.999562141828181e-05,
"loss": 0.5911,
"step": 96
},
{
"epoch": 0.04795451736497343,
"grad_norm": 0.1460785733168058,
"learning_rate": 1.999550544307146e-05,
"loss": 0.591,
"step": 97
},
{
"epoch": 0.048448893832653564,
"grad_norm": 0.16152529468424331,
"learning_rate": 1.9995387952302557e-05,
"loss": 0.5804,
"step": 98
},
{
"epoch": 0.048943270300333706,
"grad_norm": 0.16419247622456343,
"learning_rate": 1.9995268945992908e-05,
"loss": 0.5939,
"step": 99
},
{
"epoch": 0.04943764676801384,
"grad_norm": 0.14141647238898725,
"learning_rate": 1.9995148424160563e-05,
"loss": 0.5747,
"step": 100
},
{
"epoch": 0.049932023235693984,
"grad_norm": 0.15950261606989505,
"learning_rate": 1.999502638682379e-05,
"loss": 0.5431,
"step": 101
},
{
"epoch": 0.05042639970337412,
"grad_norm": 0.17886811844568348,
"learning_rate": 1.9994902834001104e-05,
"loss": 0.5816,
"step": 102
},
{
"epoch": 0.050920776171054255,
"grad_norm": 0.3499675434954369,
"learning_rate": 1.9994777765711226e-05,
"loss": 0.6119,
"step": 103
},
{
"epoch": 0.0514151526387344,
"grad_norm": 0.16114594215431693,
"learning_rate": 1.999465118197313e-05,
"loss": 0.5812,
"step": 104
},
{
"epoch": 0.05190952910641453,
"grad_norm": 0.16242778060631202,
"learning_rate": 1.9994523082805998e-05,
"loss": 0.5995,
"step": 105
},
{
"epoch": 0.052403905574094675,
"grad_norm": 0.14991207056939862,
"learning_rate": 1.9994393468229263e-05,
"loss": 0.5693,
"step": 106
},
{
"epoch": 0.05289828204177481,
"grad_norm": 0.16517270849490207,
"learning_rate": 1.9994262338262572e-05,
"loss": 0.5732,
"step": 107
},
{
"epoch": 0.05339265850945495,
"grad_norm": 3.2336347776838528,
"learning_rate": 1.999412969292581e-05,
"loss": 0.7778,
"step": 108
},
{
"epoch": 0.05388703497713509,
"grad_norm": 0.2195282738478343,
"learning_rate": 1.9993995532239087e-05,
"loss": 0.5924,
"step": 109
},
{
"epoch": 0.054381411444815224,
"grad_norm": 0.18432747066992086,
"learning_rate": 1.999385985622275e-05,
"loss": 0.6204,
"step": 110
},
{
"epoch": 0.054875787912495366,
"grad_norm": 0.15542586438446251,
"learning_rate": 1.9993722664897358e-05,
"loss": 0.5868,
"step": 111
},
{
"epoch": 0.0553701643801755,
"grad_norm": 0.19921891695906582,
"learning_rate": 1.999358395828373e-05,
"loss": 0.5855,
"step": 112
},
{
"epoch": 0.055864540847855644,
"grad_norm": 0.15870119967019275,
"learning_rate": 1.9993443736402887e-05,
"loss": 0.5737,
"step": 113
},
{
"epoch": 0.05635891731553578,
"grad_norm": 0.4217985166050489,
"learning_rate": 1.9993301999276088e-05,
"loss": 0.5856,
"step": 114
},
{
"epoch": 0.05685329378321592,
"grad_norm": 2.062631830263136,
"learning_rate": 1.9993158746924832e-05,
"loss": 0.6288,
"step": 115
},
{
"epoch": 0.05734767025089606,
"grad_norm": 0.26475685869362076,
"learning_rate": 1.9993013979370836e-05,
"loss": 0.5804,
"step": 116
},
{
"epoch": 0.05784204671857619,
"grad_norm": 0.2653375004809075,
"learning_rate": 1.9992867696636047e-05,
"loss": 0.571,
"step": 117
},
{
"epoch": 0.058336423186256335,
"grad_norm": 0.2327434756538745,
"learning_rate": 1.9992719898742646e-05,
"loss": 0.5726,
"step": 118
},
{
"epoch": 0.05883079965393647,
"grad_norm": 0.2442133583620796,
"learning_rate": 1.9992570585713044e-05,
"loss": 0.5796,
"step": 119
},
{
"epoch": 0.05932517612161661,
"grad_norm": 0.16675485380167532,
"learning_rate": 1.9992419757569884e-05,
"loss": 0.6252,
"step": 120
},
{
"epoch": 0.05981955258929675,
"grad_norm": 0.2106636096285169,
"learning_rate": 1.9992267414336027e-05,
"loss": 0.5712,
"step": 121
},
{
"epoch": 0.06031392905697689,
"grad_norm": 0.2604695555295851,
"learning_rate": 1.999211355603458e-05,
"loss": 0.5699,
"step": 122
},
{
"epoch": 0.060808305524657026,
"grad_norm": 0.20280505457234593,
"learning_rate": 1.9991958182688865e-05,
"loss": 0.5809,
"step": 123
},
{
"epoch": 0.06130268199233716,
"grad_norm": 0.1848016100259557,
"learning_rate": 1.9991801294322445e-05,
"loss": 0.6144,
"step": 124
},
{
"epoch": 0.061797058460017304,
"grad_norm": 0.19904305468964345,
"learning_rate": 1.9991642890959105e-05,
"loss": 0.5903,
"step": 125
},
{
"epoch": 0.06229143492769744,
"grad_norm": 0.2263219031966349,
"learning_rate": 1.9991482972622865e-05,
"loss": 0.5904,
"step": 126
},
{
"epoch": 0.06278581139537757,
"grad_norm": 0.15986684563682377,
"learning_rate": 1.9991321539337974e-05,
"loss": 0.5701,
"step": 127
},
{
"epoch": 0.06328018786305772,
"grad_norm": 0.21045782347422945,
"learning_rate": 1.9991158591128903e-05,
"loss": 0.5983,
"step": 128
},
{
"epoch": 0.06377456433073786,
"grad_norm": 0.14784521190351946,
"learning_rate": 1.9990994128020366e-05,
"loss": 0.5865,
"step": 129
},
{
"epoch": 0.064268940798418,
"grad_norm": 0.20690167500974782,
"learning_rate": 1.9990828150037292e-05,
"loss": 0.5749,
"step": 130
},
{
"epoch": 0.06476331726609813,
"grad_norm": 0.15272841383397337,
"learning_rate": 1.9990660657204853e-05,
"loss": 0.5808,
"step": 131
},
{
"epoch": 0.06525769373377827,
"grad_norm": 0.19969670732968584,
"learning_rate": 1.9990491649548445e-05,
"loss": 0.5962,
"step": 132
},
{
"epoch": 0.06575207020145842,
"grad_norm": 0.1769812290671827,
"learning_rate": 1.9990321127093694e-05,
"loss": 0.5719,
"step": 133
},
{
"epoch": 0.06624644666913855,
"grad_norm": 0.16741409970150983,
"learning_rate": 1.999014908986645e-05,
"loss": 0.5925,
"step": 134
},
{
"epoch": 0.06674082313681869,
"grad_norm": 0.18943771574184248,
"learning_rate": 1.99899755378928e-05,
"loss": 0.5756,
"step": 135
},
{
"epoch": 0.06723519960449882,
"grad_norm": 0.17075139623058735,
"learning_rate": 1.998980047119906e-05,
"loss": 0.599,
"step": 136
},
{
"epoch": 0.06772957607217897,
"grad_norm": 0.19947430016309822,
"learning_rate": 1.998962388981178e-05,
"loss": 0.5688,
"step": 137
},
{
"epoch": 0.0682239525398591,
"grad_norm": 0.16747509271626193,
"learning_rate": 1.998944579375772e-05,
"loss": 0.61,
"step": 138
},
{
"epoch": 0.06871832900753924,
"grad_norm": 0.18672641821370878,
"learning_rate": 1.9989266183063897e-05,
"loss": 0.575,
"step": 139
},
{
"epoch": 0.06921270547521938,
"grad_norm": 0.15327082040887152,
"learning_rate": 1.998908505775754e-05,
"loss": 0.5643,
"step": 140
},
{
"epoch": 0.06970708194289951,
"grad_norm": 0.20601802630879618,
"learning_rate": 1.9988902417866106e-05,
"loss": 0.5849,
"step": 141
},
{
"epoch": 0.07020145841057966,
"grad_norm": 0.1519792147147535,
"learning_rate": 1.99887182634173e-05,
"loss": 0.5895,
"step": 142
},
{
"epoch": 0.0706958348782598,
"grad_norm": 0.18014404646578858,
"learning_rate": 1.998853259443903e-05,
"loss": 0.6007,
"step": 143
},
{
"epoch": 0.07119021134593993,
"grad_norm": 0.1325415672465801,
"learning_rate": 1.9988345410959457e-05,
"loss": 0.5569,
"step": 144
},
{
"epoch": 0.07168458781362007,
"grad_norm": 0.17742351496714065,
"learning_rate": 1.998815671300696e-05,
"loss": 0.5596,
"step": 145
},
{
"epoch": 0.0721789642813002,
"grad_norm": 0.13302393805021162,
"learning_rate": 1.9987966500610156e-05,
"loss": 0.563,
"step": 146
},
{
"epoch": 0.07267334074898035,
"grad_norm": 0.17633674333660287,
"learning_rate": 1.9987774773797873e-05,
"loss": 0.6032,
"step": 147
},
{
"epoch": 0.07316771721666049,
"grad_norm": 0.12651424798987962,
"learning_rate": 1.998758153259919e-05,
"loss": 0.5423,
"step": 148
},
{
"epoch": 0.07366209368434062,
"grad_norm": 0.16584356722956609,
"learning_rate": 1.9987386777043407e-05,
"loss": 0.5937,
"step": 149
},
{
"epoch": 0.07415647015202076,
"grad_norm": 0.5539114141166533,
"learning_rate": 1.9987190507160052e-05,
"loss": 0.6168,
"step": 150
},
{
"epoch": 0.07465084661970091,
"grad_norm": 0.18056118432595455,
"learning_rate": 1.9986992722978882e-05,
"loss": 0.58,
"step": 151
},
{
"epoch": 0.07514522308738104,
"grad_norm": 0.1605876349265529,
"learning_rate": 1.9986793424529895e-05,
"loss": 0.5745,
"step": 152
},
{
"epoch": 0.07563959955506118,
"grad_norm": 0.2895444965813843,
"learning_rate": 1.9986592611843293e-05,
"loss": 0.6057,
"step": 153
},
{
"epoch": 0.07613397602274131,
"grad_norm": 0.16825732505153987,
"learning_rate": 1.998639028494954e-05,
"loss": 0.597,
"step": 154
},
{
"epoch": 0.07662835249042145,
"grad_norm": 0.1608909060837544,
"learning_rate": 1.998618644387931e-05,
"loss": 0.5745,
"step": 155
},
{
"epoch": 0.0771227289581016,
"grad_norm": 0.15229623275016674,
"learning_rate": 1.99859810886635e-05,
"loss": 0.5835,
"step": 156
},
{
"epoch": 0.07761710542578173,
"grad_norm": 0.16828101696201042,
"learning_rate": 1.998577421933326e-05,
"loss": 0.5664,
"step": 157
},
{
"epoch": 0.07811148189346187,
"grad_norm": 0.14767223439309024,
"learning_rate": 1.9985565835919948e-05,
"loss": 0.5529,
"step": 158
},
{
"epoch": 0.078605858361142,
"grad_norm": 0.14656496453382736,
"learning_rate": 1.998535593845516e-05,
"loss": 0.6123,
"step": 159
},
{
"epoch": 0.07910023482882214,
"grad_norm": 0.13421194491707011,
"learning_rate": 1.998514452697073e-05,
"loss": 0.5544,
"step": 160
},
{
"epoch": 0.07959461129650229,
"grad_norm": 0.1565487972921777,
"learning_rate": 1.9984931601498703e-05,
"loss": 0.5622,
"step": 161
},
{
"epoch": 0.08008898776418243,
"grad_norm": 0.18335120675196886,
"learning_rate": 1.9984717162071367e-05,
"loss": 0.5793,
"step": 162
},
{
"epoch": 0.08058336423186256,
"grad_norm": 0.13924395608906595,
"learning_rate": 1.9984501208721242e-05,
"loss": 0.5735,
"step": 163
},
{
"epoch": 0.0810777406995427,
"grad_norm": 0.13606708490947586,
"learning_rate": 1.998428374148106e-05,
"loss": 0.5202,
"step": 164
},
{
"epoch": 0.08157211716722285,
"grad_norm": 0.1427146106536908,
"learning_rate": 1.9984064760383807e-05,
"loss": 0.5822,
"step": 165
},
{
"epoch": 0.08206649363490298,
"grad_norm": 0.14020092794826075,
"learning_rate": 1.9983844265462674e-05,
"loss": 0.5691,
"step": 166
},
{
"epoch": 0.08256087010258312,
"grad_norm": 0.13100899137455282,
"learning_rate": 1.9983622256751105e-05,
"loss": 0.5649,
"step": 167
},
{
"epoch": 0.08305524657026325,
"grad_norm": 0.13870467676096895,
"learning_rate": 1.9983398734282752e-05,
"loss": 0.5558,
"step": 168
},
{
"epoch": 0.08354962303794339,
"grad_norm": 0.20141873764576138,
"learning_rate": 1.9983173698091512e-05,
"loss": 0.5873,
"step": 169
},
{
"epoch": 0.08404399950562354,
"grad_norm": 0.1566841037267247,
"learning_rate": 1.99829471482115e-05,
"loss": 0.5715,
"step": 170
},
{
"epoch": 0.08453837597330367,
"grad_norm": 0.14752419760529964,
"learning_rate": 1.9982719084677077e-05,
"loss": 0.581,
"step": 171
},
{
"epoch": 0.08503275244098381,
"grad_norm": 0.15293034049272278,
"learning_rate": 1.9982489507522813e-05,
"loss": 0.5591,
"step": 172
},
{
"epoch": 0.08552712890866394,
"grad_norm": 0.2935137711290884,
"learning_rate": 1.998225841678352e-05,
"loss": 0.5828,
"step": 173
},
{
"epoch": 0.08602150537634409,
"grad_norm": 0.1517863782087887,
"learning_rate": 1.9982025812494238e-05,
"loss": 0.5722,
"step": 174
},
{
"epoch": 0.08651588184402423,
"grad_norm": 0.13408349247652207,
"learning_rate": 1.9981791694690237e-05,
"loss": 0.5503,
"step": 175
},
{
"epoch": 0.08701025831170436,
"grad_norm": 0.16703267190626994,
"learning_rate": 1.998155606340701e-05,
"loss": 0.5841,
"step": 176
},
{
"epoch": 0.0875046347793845,
"grad_norm": 0.14459627767555425,
"learning_rate": 1.998131891868029e-05,
"loss": 0.6034,
"step": 177
},
{
"epoch": 0.08799901124706463,
"grad_norm": 0.16637493327473699,
"learning_rate": 1.998108026054603e-05,
"loss": 0.5683,
"step": 178
},
{
"epoch": 0.08849338771474478,
"grad_norm": 0.143552607123043,
"learning_rate": 1.9980840089040415e-05,
"loss": 0.5538,
"step": 179
},
{
"epoch": 0.08898776418242492,
"grad_norm": 0.1441894683922219,
"learning_rate": 1.9980598404199868e-05,
"loss": 0.561,
"step": 180
},
{
"epoch": 0.08948214065010505,
"grad_norm": 0.18146569364990972,
"learning_rate": 1.9980355206061025e-05,
"loss": 0.5815,
"step": 181
},
{
"epoch": 0.08997651711778519,
"grad_norm": 0.15667594759038228,
"learning_rate": 1.9980110494660773e-05,
"loss": 0.5646,
"step": 182
},
{
"epoch": 0.09047089358546533,
"grad_norm": 0.14650210044115178,
"learning_rate": 1.99798642700362e-05,
"loss": 0.5658,
"step": 183
},
{
"epoch": 0.09096527005314547,
"grad_norm": 0.22662863812451683,
"learning_rate": 1.997961653222465e-05,
"loss": 0.5765,
"step": 184
},
{
"epoch": 0.09145964652082561,
"grad_norm": 0.1357586525919655,
"learning_rate": 1.9979367281263684e-05,
"loss": 0.5569,
"step": 185
},
{
"epoch": 0.09195402298850575,
"grad_norm": 0.14342840265672718,
"learning_rate": 1.9979116517191094e-05,
"loss": 0.5829,
"step": 186
},
{
"epoch": 0.09244839945618588,
"grad_norm": 0.14179344645385553,
"learning_rate": 1.9978864240044903e-05,
"loss": 0.5571,
"step": 187
},
{
"epoch": 0.09294277592386603,
"grad_norm": 0.15417641510206428,
"learning_rate": 1.9978610449863362e-05,
"loss": 0.5541,
"step": 188
},
{
"epoch": 0.09343715239154617,
"grad_norm": 0.1400741455003795,
"learning_rate": 1.997835514668495e-05,
"loss": 0.5497,
"step": 189
},
{
"epoch": 0.0939315288592263,
"grad_norm": 0.1467234644472966,
"learning_rate": 1.997809833054838e-05,
"loss": 0.5759,
"step": 190
},
{
"epoch": 0.09442590532690644,
"grad_norm": 0.14595243524368143,
"learning_rate": 1.9977840001492587e-05,
"loss": 0.5966,
"step": 191
},
{
"epoch": 0.09492028179458657,
"grad_norm": 0.15123182280378245,
"learning_rate": 1.9977580159556743e-05,
"loss": 0.5587,
"step": 192
},
{
"epoch": 0.09541465826226672,
"grad_norm": 0.1348602493012784,
"learning_rate": 1.9977318804780245e-05,
"loss": 0.548,
"step": 193
},
{
"epoch": 0.09590903472994686,
"grad_norm": 0.13753240435960756,
"learning_rate": 1.9977055937202724e-05,
"loss": 0.577,
"step": 194
},
{
"epoch": 0.09640341119762699,
"grad_norm": 0.14550942402527817,
"learning_rate": 1.9976791556864034e-05,
"loss": 0.5355,
"step": 195
},
{
"epoch": 0.09689778766530713,
"grad_norm": 0.14814948019504814,
"learning_rate": 1.9976525663804257e-05,
"loss": 0.5833,
"step": 196
},
{
"epoch": 0.09739216413298726,
"grad_norm": 0.1438487184249872,
"learning_rate": 1.997625825806372e-05,
"loss": 0.5326,
"step": 197
},
{
"epoch": 0.09788654060066741,
"grad_norm": 0.13083794473742966,
"learning_rate": 1.997598933968296e-05,
"loss": 0.5604,
"step": 198
},
{
"epoch": 0.09838091706834755,
"grad_norm": 0.14311660750658542,
"learning_rate": 1.997571890870275e-05,
"loss": 0.5798,
"step": 199
},
{
"epoch": 0.09887529353602768,
"grad_norm": 0.13939287992459576,
"learning_rate": 1.9975446965164104e-05,
"loss": 0.5655,
"step": 200
},
{
"epoch": 0.09936967000370782,
"grad_norm": 0.1511570890083603,
"learning_rate": 1.9975173509108242e-05,
"loss": 0.5514,
"step": 201
},
{
"epoch": 0.09986404647138797,
"grad_norm": 0.13802878117769124,
"learning_rate": 1.9974898540576636e-05,
"loss": 0.5602,
"step": 202
},
{
"epoch": 0.1003584229390681,
"grad_norm": 0.13949487848360417,
"learning_rate": 1.9974622059610974e-05,
"loss": 0.5487,
"step": 203
},
{
"epoch": 0.10085279940674824,
"grad_norm": 0.1278953382667824,
"learning_rate": 1.997434406625318e-05,
"loss": 0.5684,
"step": 204
},
{
"epoch": 0.10134717587442837,
"grad_norm": 0.13675860673935314,
"learning_rate": 1.9974064560545395e-05,
"loss": 0.5995,
"step": 205
},
{
"epoch": 0.10184155234210851,
"grad_norm": 0.13160058163579522,
"learning_rate": 1.9973783542530012e-05,
"loss": 0.5301,
"step": 206
},
{
"epoch": 0.10233592880978866,
"grad_norm": 0.12722960111739565,
"learning_rate": 1.9973501012249632e-05,
"loss": 0.5686,
"step": 207
},
{
"epoch": 0.1028303052774688,
"grad_norm": 0.14127538159239894,
"learning_rate": 1.9973216969747097e-05,
"loss": 0.5484,
"step": 208
},
{
"epoch": 0.10332468174514893,
"grad_norm": 0.16352531235131595,
"learning_rate": 1.997293141506547e-05,
"loss": 0.5544,
"step": 209
},
{
"epoch": 0.10381905821282907,
"grad_norm": 0.12560236293990756,
"learning_rate": 1.9972644348248055e-05,
"loss": 0.5606,
"step": 210
},
{
"epoch": 0.1043134346805092,
"grad_norm": 0.1435086097318651,
"learning_rate": 1.997235576933837e-05,
"loss": 0.5594,
"step": 211
},
{
"epoch": 0.10480781114818935,
"grad_norm": 0.13567616067222835,
"learning_rate": 1.997206567838018e-05,
"loss": 0.5621,
"step": 212
},
{
"epoch": 0.10530218761586949,
"grad_norm": 0.1415433961954579,
"learning_rate": 1.9971774075417462e-05,
"loss": 0.566,
"step": 213
},
{
"epoch": 0.10579656408354962,
"grad_norm": 0.13624086144241981,
"learning_rate": 1.9971480960494432e-05,
"loss": 0.5589,
"step": 214
},
{
"epoch": 0.10629094055122976,
"grad_norm": 0.20085312730299076,
"learning_rate": 1.9971186333655536e-05,
"loss": 0.5616,
"step": 215
},
{
"epoch": 0.1067853170189099,
"grad_norm": 0.13826466213017946,
"learning_rate": 1.997089019494544e-05,
"loss": 0.5655,
"step": 216
},
{
"epoch": 0.10727969348659004,
"grad_norm": 0.14672338023847062,
"learning_rate": 1.9970592544409054e-05,
"loss": 0.5685,
"step": 217
},
{
"epoch": 0.10777406995427018,
"grad_norm": 0.13411996415966485,
"learning_rate": 1.9970293382091502e-05,
"loss": 0.5955,
"step": 218
},
{
"epoch": 0.10826844642195031,
"grad_norm": 0.14255853133563415,
"learning_rate": 1.996999270803815e-05,
"loss": 0.5714,
"step": 219
},
{
"epoch": 0.10876282288963045,
"grad_norm": 0.1440745719060979,
"learning_rate": 1.996969052229458e-05,
"loss": 0.5225,
"step": 220
},
{
"epoch": 0.1092571993573106,
"grad_norm": 0.15988706071379843,
"learning_rate": 1.996938682490662e-05,
"loss": 0.5676,
"step": 221
},
{
"epoch": 0.10975157582499073,
"grad_norm": 0.14585871330536193,
"learning_rate": 1.9969081615920312e-05,
"loss": 0.5842,
"step": 222
},
{
"epoch": 0.11024595229267087,
"grad_norm": 0.14373627269749029,
"learning_rate": 1.9968774895381933e-05,
"loss": 0.573,
"step": 223
},
{
"epoch": 0.110740328760351,
"grad_norm": 0.16480665746109965,
"learning_rate": 1.996846666333799e-05,
"loss": 0.5793,
"step": 224
},
{
"epoch": 0.11123470522803114,
"grad_norm": 0.17553401533136342,
"learning_rate": 1.996815691983522e-05,
"loss": 0.5881,
"step": 225
},
{
"epoch": 0.11172908169571129,
"grad_norm": 0.17218807335015235,
"learning_rate": 1.9967845664920584e-05,
"loss": 0.5387,
"step": 226
},
{
"epoch": 0.11222345816339142,
"grad_norm": 0.15322144810671637,
"learning_rate": 1.996753289864128e-05,
"loss": 0.562,
"step": 227
},
{
"epoch": 0.11271783463107156,
"grad_norm": 0.15907520514137446,
"learning_rate": 1.996721862104473e-05,
"loss": 0.5583,
"step": 228
},
{
"epoch": 0.1132122110987517,
"grad_norm": 0.14644463388804965,
"learning_rate": 1.996690283217858e-05,
"loss": 0.5358,
"step": 229
},
{
"epoch": 0.11370658756643184,
"grad_norm": 0.17295403904221937,
"learning_rate": 1.9966585532090717e-05,
"loss": 0.5679,
"step": 230
},
{
"epoch": 0.11420096403411198,
"grad_norm": 0.15154162252713482,
"learning_rate": 1.9966266720829256e-05,
"loss": 0.5256,
"step": 231
},
{
"epoch": 0.11469534050179211,
"grad_norm": 0.16607520511388832,
"learning_rate": 1.9965946398442524e-05,
"loss": 0.5477,
"step": 232
},
{
"epoch": 0.11518971696947225,
"grad_norm": 0.15388707740256324,
"learning_rate": 1.9965624564979097e-05,
"loss": 0.5779,
"step": 233
},
{
"epoch": 0.11568409343715239,
"grad_norm": 0.17756446037091575,
"learning_rate": 1.9965301220487775e-05,
"loss": 0.5569,
"step": 234
},
{
"epoch": 0.11617846990483253,
"grad_norm": 0.14489232855678777,
"learning_rate": 1.996497636501758e-05,
"loss": 0.5348,
"step": 235
},
{
"epoch": 0.11667284637251267,
"grad_norm": 0.1571233197216102,
"learning_rate": 1.996464999861777e-05,
"loss": 0.5564,
"step": 236
},
{
"epoch": 0.1171672228401928,
"grad_norm": 0.15426559588030855,
"learning_rate": 1.996432212133783e-05,
"loss": 0.5394,
"step": 237
},
{
"epoch": 0.11766159930787294,
"grad_norm": 0.14016851529222546,
"learning_rate": 1.9963992733227473e-05,
"loss": 0.5464,
"step": 238
},
{
"epoch": 0.11815597577555309,
"grad_norm": 0.14139134273819806,
"learning_rate": 1.996366183433664e-05,
"loss": 0.5615,
"step": 239
},
{
"epoch": 0.11865035224323323,
"grad_norm": 0.14673562252471375,
"learning_rate": 1.996332942471551e-05,
"loss": 0.5992,
"step": 240
},
{
"epoch": 0.11914472871091336,
"grad_norm": 0.1420337648728369,
"learning_rate": 1.996299550441448e-05,
"loss": 0.565,
"step": 241
},
{
"epoch": 0.1196391051785935,
"grad_norm": 0.1417517867858186,
"learning_rate": 1.996266007348418e-05,
"loss": 0.5264,
"step": 242
},
{
"epoch": 0.12013348164627363,
"grad_norm": 0.1362629445986696,
"learning_rate": 1.996232313197547e-05,
"loss": 0.5549,
"step": 243
},
{
"epoch": 0.12062785811395378,
"grad_norm": 0.152508765696342,
"learning_rate": 1.9961984679939438e-05,
"loss": 0.5282,
"step": 244
},
{
"epoch": 0.12112223458163392,
"grad_norm": 0.12528630185330356,
"learning_rate": 1.9961644717427405e-05,
"loss": 0.5387,
"step": 245
},
{
"epoch": 0.12161661104931405,
"grad_norm": 0.14981151811880533,
"learning_rate": 1.996130324449091e-05,
"loss": 0.5461,
"step": 246
},
{
"epoch": 0.12211098751699419,
"grad_norm": 0.13663136293438152,
"learning_rate": 1.9960960261181733e-05,
"loss": 0.5673,
"step": 247
},
{
"epoch": 0.12260536398467432,
"grad_norm": 0.13205420245643135,
"learning_rate": 1.996061576755188e-05,
"loss": 0.5455,
"step": 248
},
{
"epoch": 0.12309974045235447,
"grad_norm": 0.13668652633526987,
"learning_rate": 1.996026976365358e-05,
"loss": 0.5237,
"step": 249
},
{
"epoch": 0.12359411692003461,
"grad_norm": 0.16452031824825916,
"learning_rate": 1.9959922249539303e-05,
"loss": 0.6007,
"step": 250
},
{
"epoch": 0.12408849338771474,
"grad_norm": 0.14751886797509753,
"learning_rate": 1.995957322526173e-05,
"loss": 0.5562,
"step": 251
},
{
"epoch": 0.12458286985539488,
"grad_norm": 0.13944542821669048,
"learning_rate": 1.9959222690873794e-05,
"loss": 0.5392,
"step": 252
},
{
"epoch": 0.12507724632307501,
"grad_norm": 0.13703392068868694,
"learning_rate": 1.9958870646428634e-05,
"loss": 0.534,
"step": 253
},
{
"epoch": 0.12557162279075515,
"grad_norm": 0.15694449197236257,
"learning_rate": 1.995851709197963e-05,
"loss": 0.5279,
"step": 254
},
{
"epoch": 0.12606599925843529,
"grad_norm": 0.15402296953177877,
"learning_rate": 1.9958162027580396e-05,
"loss": 0.5303,
"step": 255
},
{
"epoch": 0.12656037572611545,
"grad_norm": 0.15051803903838692,
"learning_rate": 1.9957805453284763e-05,
"loss": 0.5416,
"step": 256
},
{
"epoch": 0.12705475219379558,
"grad_norm": 0.16435874923817329,
"learning_rate": 1.9957447369146792e-05,
"loss": 0.5959,
"step": 257
},
{
"epoch": 0.12754912866147572,
"grad_norm": 0.13344314329826207,
"learning_rate": 1.995708777522079e-05,
"loss": 0.5478,
"step": 258
},
{
"epoch": 0.12804350512915585,
"grad_norm": 0.15436092895578277,
"learning_rate": 1.995672667156127e-05,
"loss": 0.5578,
"step": 259
},
{
"epoch": 0.128537881596836,
"grad_norm": 0.14678932275857312,
"learning_rate": 1.995636405822298e-05,
"loss": 0.5691,
"step": 260
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.15782120420531398,
"learning_rate": 1.9955999935260913e-05,
"loss": 0.5577,
"step": 261
},
{
"epoch": 0.12952663453219626,
"grad_norm": 0.15624701951113734,
"learning_rate": 1.995563430273027e-05,
"loss": 0.5211,
"step": 262
},
{
"epoch": 0.1300210109998764,
"grad_norm": 0.13567939001651347,
"learning_rate": 1.9955267160686492e-05,
"loss": 0.5522,
"step": 263
},
{
"epoch": 0.13051538746755653,
"grad_norm": 0.1511753099952918,
"learning_rate": 1.995489850918525e-05,
"loss": 0.552,
"step": 264
},
{
"epoch": 0.1310097639352367,
"grad_norm": 0.12555889405646392,
"learning_rate": 1.9954528348282435e-05,
"loss": 0.558,
"step": 265
},
{
"epoch": 0.13150414040291683,
"grad_norm": 0.14159780687220566,
"learning_rate": 1.9954156678034176e-05,
"loss": 0.5402,
"step": 266
},
{
"epoch": 0.13199851687059697,
"grad_norm": 0.1351057147982322,
"learning_rate": 1.9953783498496825e-05,
"loss": 0.5408,
"step": 267
},
{
"epoch": 0.1324928933382771,
"grad_norm": 0.12986591661531477,
"learning_rate": 1.995340880972697e-05,
"loss": 0.5232,
"step": 268
},
{
"epoch": 0.13298726980595724,
"grad_norm": 0.1402487131956536,
"learning_rate": 1.9953032611781412e-05,
"loss": 0.5743,
"step": 269
},
{
"epoch": 0.13348164627363737,
"grad_norm": 0.14107326852236063,
"learning_rate": 1.9952654904717203e-05,
"loss": 0.6261,
"step": 270
},
{
"epoch": 0.1339760227413175,
"grad_norm": 0.1466024230836744,
"learning_rate": 1.9952275688591606e-05,
"loss": 0.5713,
"step": 271
},
{
"epoch": 0.13447039920899764,
"grad_norm": 0.13195039427747016,
"learning_rate": 1.995189496346212e-05,
"loss": 0.5423,
"step": 272
},
{
"epoch": 0.13496477567667778,
"grad_norm": 0.15528500325117436,
"learning_rate": 1.9951512729386474e-05,
"loss": 0.5516,
"step": 273
},
{
"epoch": 0.13545915214435794,
"grad_norm": 0.14002850817531884,
"learning_rate": 1.9951128986422623e-05,
"loss": 0.5573,
"step": 274
},
{
"epoch": 0.13595352861203808,
"grad_norm": 0.1356507313384556,
"learning_rate": 1.9950743734628754e-05,
"loss": 0.5497,
"step": 275
},
{
"epoch": 0.1364479050797182,
"grad_norm": 0.12828146501466803,
"learning_rate": 1.9950356974063272e-05,
"loss": 0.5423,
"step": 276
},
{
"epoch": 0.13694228154739835,
"grad_norm": 0.14583605358112986,
"learning_rate": 1.994996870478483e-05,
"loss": 0.5364,
"step": 277
},
{
"epoch": 0.13743665801507848,
"grad_norm": 0.13302389113741286,
"learning_rate": 1.9949578926852293e-05,
"loss": 0.5496,
"step": 278
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.13041265618944445,
"learning_rate": 1.994918764032476e-05,
"loss": 0.5491,
"step": 279
},
{
"epoch": 0.13842541095043875,
"grad_norm": 0.15864562967556367,
"learning_rate": 1.9948794845261562e-05,
"loss": 0.5732,
"step": 280
},
{
"epoch": 0.1389197874181189,
"grad_norm": 0.13028417728753308,
"learning_rate": 1.994840054172226e-05,
"loss": 0.5307,
"step": 281
},
{
"epoch": 0.13941416388579903,
"grad_norm": 0.16623638686336492,
"learning_rate": 1.994800472976663e-05,
"loss": 0.5492,
"step": 282
},
{
"epoch": 0.1399085403534792,
"grad_norm": 0.12886464830115654,
"learning_rate": 1.994760740945469e-05,
"loss": 0.5495,
"step": 283
},
{
"epoch": 0.14040291682115932,
"grad_norm": 0.14856231234278064,
"learning_rate": 1.9947208580846694e-05,
"loss": 0.5709,
"step": 284
},
{
"epoch": 0.14089729328883946,
"grad_norm": 0.15497356519197575,
"learning_rate": 1.9946808244003096e-05,
"loss": 0.5311,
"step": 285
},
{
"epoch": 0.1413916697565196,
"grad_norm": 0.14336039590308394,
"learning_rate": 1.994640639898461e-05,
"loss": 0.5639,
"step": 286
},
{
"epoch": 0.14188604622419973,
"grad_norm": 0.15242745680692615,
"learning_rate": 1.994600304585216e-05,
"loss": 0.5629,
"step": 287
},
{
"epoch": 0.14238042269187987,
"grad_norm": 0.15048468939614001,
"learning_rate": 1.9945598184666907e-05,
"loss": 0.5619,
"step": 288
},
{
"epoch": 0.14287479915956,
"grad_norm": 0.14431617178115808,
"learning_rate": 1.9945191815490235e-05,
"loss": 0.5777,
"step": 289
},
{
"epoch": 0.14336917562724014,
"grad_norm": 0.15830772612740787,
"learning_rate": 1.994478393838376e-05,
"loss": 0.5366,
"step": 290
},
{
"epoch": 0.14386355209492027,
"grad_norm": 0.1444984021364644,
"learning_rate": 1.9944374553409326e-05,
"loss": 0.5354,
"step": 291
},
{
"epoch": 0.1443579285626004,
"grad_norm": 0.15514652495434997,
"learning_rate": 1.9943963660629008e-05,
"loss": 0.5769,
"step": 292
},
{
"epoch": 0.14485230503028057,
"grad_norm": 0.14913476502811546,
"learning_rate": 1.99435512601051e-05,
"loss": 0.5344,
"step": 293
},
{
"epoch": 0.1453466814979607,
"grad_norm": 0.13019653862465724,
"learning_rate": 1.9943137351900143e-05,
"loss": 0.549,
"step": 294
},
{
"epoch": 0.14584105796564084,
"grad_norm": 0.14942863646466825,
"learning_rate": 1.9942721936076885e-05,
"loss": 0.5411,
"step": 295
},
{
"epoch": 0.14633543443332098,
"grad_norm": 0.30522668076240705,
"learning_rate": 1.994230501269832e-05,
"loss": 0.5627,
"step": 296
},
{
"epoch": 0.1468298109010011,
"grad_norm": 0.13679480997028864,
"learning_rate": 1.9941886581827658e-05,
"loss": 0.5493,
"step": 297
},
{
"epoch": 0.14732418736868125,
"grad_norm": 0.1424922789927107,
"learning_rate": 1.9941466643528348e-05,
"loss": 0.5543,
"step": 298
},
{
"epoch": 0.14781856383636138,
"grad_norm": 0.1440374200850753,
"learning_rate": 1.9941045197864058e-05,
"loss": 0.5465,
"step": 299
},
{
"epoch": 0.14831294030404152,
"grad_norm": 0.152963594388981,
"learning_rate": 1.9940622244898696e-05,
"loss": 0.5959,
"step": 300
},
{
"epoch": 0.14880731677172165,
"grad_norm": 0.13437944967367274,
"learning_rate": 1.9940197784696385e-05,
"loss": 0.5648,
"step": 301
},
{
"epoch": 0.14930169323940182,
"grad_norm": 0.13350170521700735,
"learning_rate": 1.9939771817321484e-05,
"loss": 0.5379,
"step": 302
},
{
"epoch": 0.14979606970708195,
"grad_norm": 0.14763590757552225,
"learning_rate": 1.9939344342838585e-05,
"loss": 0.5327,
"step": 303
},
{
"epoch": 0.1502904461747621,
"grad_norm": 0.13383740426098156,
"learning_rate": 1.99389153613125e-05,
"loss": 0.5698,
"step": 304
},
{
"epoch": 0.15078482264244222,
"grad_norm": 0.13673523719975234,
"learning_rate": 1.9938484872808274e-05,
"loss": 0.5577,
"step": 305
},
{
"epoch": 0.15127919911012236,
"grad_norm": 0.14238036085570663,
"learning_rate": 1.9938052877391177e-05,
"loss": 0.5351,
"step": 306
},
{
"epoch": 0.1517735755778025,
"grad_norm": 0.13960683244740305,
"learning_rate": 1.9937619375126714e-05,
"loss": 0.5414,
"step": 307
},
{
"epoch": 0.15226795204548263,
"grad_norm": 0.16860705364868128,
"learning_rate": 1.9937184366080612e-05,
"loss": 0.5518,
"step": 308
},
{
"epoch": 0.15276232851316277,
"grad_norm": 0.13267098489221013,
"learning_rate": 1.9936747850318826e-05,
"loss": 0.5116,
"step": 309
},
{
"epoch": 0.1532567049808429,
"grad_norm": 0.15296297417333332,
"learning_rate": 1.993630982790755e-05,
"loss": 0.5462,
"step": 310
},
{
"epoch": 0.15375108144852306,
"grad_norm": 0.14404943980403798,
"learning_rate": 1.993587029891319e-05,
"loss": 0.561,
"step": 311
},
{
"epoch": 0.1542454579162032,
"grad_norm": 0.12695037045346788,
"learning_rate": 1.993542926340239e-05,
"loss": 0.5123,
"step": 312
},
{
"epoch": 0.15473983438388333,
"grad_norm": 0.14261785213479566,
"learning_rate": 1.9934986721442027e-05,
"loss": 0.5732,
"step": 313
},
{
"epoch": 0.15523421085156347,
"grad_norm": 0.13313171316168806,
"learning_rate": 1.99345426730992e-05,
"loss": 0.571,
"step": 314
},
{
"epoch": 0.1557285873192436,
"grad_norm": 0.1450343503395449,
"learning_rate": 1.9934097118441235e-05,
"loss": 0.5528,
"step": 315
},
{
"epoch": 0.15622296378692374,
"grad_norm": 0.1354405849923585,
"learning_rate": 1.9933650057535687e-05,
"loss": 0.5386,
"step": 316
},
{
"epoch": 0.15671734025460388,
"grad_norm": 0.1329716708165765,
"learning_rate": 1.9933201490450346e-05,
"loss": 0.5557,
"step": 317
},
{
"epoch": 0.157211716722284,
"grad_norm": 0.22899589707658533,
"learning_rate": 1.9932751417253223e-05,
"loss": 0.593,
"step": 318
},
{
"epoch": 0.15770609318996415,
"grad_norm": 0.12207099260590105,
"learning_rate": 1.993229983801256e-05,
"loss": 0.5418,
"step": 319
},
{
"epoch": 0.15820046965764428,
"grad_norm": 0.15679368487501968,
"learning_rate": 1.993184675279683e-05,
"loss": 0.5417,
"step": 320
},
{
"epoch": 0.15869484612532445,
"grad_norm": 0.12949771443844627,
"learning_rate": 1.993139216167473e-05,
"loss": 0.5559,
"step": 321
},
{
"epoch": 0.15918922259300458,
"grad_norm": 0.13528336218703266,
"learning_rate": 1.993093606471518e-05,
"loss": 0.5301,
"step": 322
},
{
"epoch": 0.15968359906068472,
"grad_norm": 0.13396849115010456,
"learning_rate": 1.9930478461987343e-05,
"loss": 0.5506,
"step": 323
},
{
"epoch": 0.16017797552836485,
"grad_norm": 0.12952612630207255,
"learning_rate": 1.9930019353560605e-05,
"loss": 0.5561,
"step": 324
},
{
"epoch": 0.160672351996045,
"grad_norm": 0.15258460392887746,
"learning_rate": 1.992955873950457e-05,
"loss": 0.512,
"step": 325
},
{
"epoch": 0.16116672846372512,
"grad_norm": 0.13016060646892047,
"learning_rate": 1.992909661988908e-05,
"loss": 0.5468,
"step": 326
},
{
"epoch": 0.16166110493140526,
"grad_norm": 0.13970553806982206,
"learning_rate": 1.9928632994784206e-05,
"loss": 0.5368,
"step": 327
},
{
"epoch": 0.1621554813990854,
"grad_norm": 0.14405650206109016,
"learning_rate": 1.992816786426025e-05,
"loss": 0.5712,
"step": 328
},
{
"epoch": 0.16264985786676553,
"grad_norm": 0.14749244914118798,
"learning_rate": 1.9927701228387725e-05,
"loss": 0.5641,
"step": 329
},
{
"epoch": 0.1631442343344457,
"grad_norm": 0.152614987085935,
"learning_rate": 1.992723308723739e-05,
"loss": 0.5374,
"step": 330
},
{
"epoch": 0.16363861080212583,
"grad_norm": 0.1327084422521158,
"learning_rate": 1.9926763440880228e-05,
"loss": 0.5213,
"step": 331
},
{
"epoch": 0.16413298726980596,
"grad_norm": 0.16111996098948755,
"learning_rate": 1.992629228938745e-05,
"loss": 0.5771,
"step": 332
},
{
"epoch": 0.1646273637374861,
"grad_norm": 0.13015636221825003,
"learning_rate": 1.9925819632830485e-05,
"loss": 0.544,
"step": 333
},
{
"epoch": 0.16512174020516623,
"grad_norm": 0.1371079587227566,
"learning_rate": 1.9925345471281007e-05,
"loss": 0.5547,
"step": 334
},
{
"epoch": 0.16561611667284637,
"grad_norm": 0.13502426283957789,
"learning_rate": 1.992486980481091e-05,
"loss": 0.5686,
"step": 335
},
{
"epoch": 0.1661104931405265,
"grad_norm": 0.14520057336083325,
"learning_rate": 1.9924392633492316e-05,
"loss": 0.5284,
"step": 336
},
{
"epoch": 0.16660486960820664,
"grad_norm": 0.1250434851697977,
"learning_rate": 1.9923913957397572e-05,
"loss": 0.5349,
"step": 337
},
{
"epoch": 0.16709924607588678,
"grad_norm": 0.1618393373247686,
"learning_rate": 1.992343377659926e-05,
"loss": 0.5877,
"step": 338
},
{
"epoch": 0.16759362254356694,
"grad_norm": 0.12900568294151976,
"learning_rate": 1.9922952091170185e-05,
"loss": 0.5787,
"step": 339
},
{
"epoch": 0.16808799901124707,
"grad_norm": 0.14863670335825885,
"learning_rate": 1.9922468901183384e-05,
"loss": 0.5467,
"step": 340
},
{
"epoch": 0.1685823754789272,
"grad_norm": 0.13253078203304836,
"learning_rate": 1.9921984206712122e-05,
"loss": 0.5439,
"step": 341
},
{
"epoch": 0.16907675194660735,
"grad_norm": 0.1375712252093406,
"learning_rate": 1.9921498007829885e-05,
"loss": 0.5476,
"step": 342
},
{
"epoch": 0.16957112841428748,
"grad_norm": 0.13429673517140214,
"learning_rate": 1.9921010304610397e-05,
"loss": 0.5323,
"step": 343
},
{
"epoch": 0.17006550488196762,
"grad_norm": 0.12678671615755477,
"learning_rate": 1.9920521097127602e-05,
"loss": 0.5067,
"step": 344
},
{
"epoch": 0.17055988134964775,
"grad_norm": 0.13989938052628131,
"learning_rate": 1.9920030385455676e-05,
"loss": 0.5385,
"step": 345
},
{
"epoch": 0.1710542578173279,
"grad_norm": 0.1384192258619826,
"learning_rate": 1.991953816966903e-05,
"loss": 0.5239,
"step": 346
},
{
"epoch": 0.17154863428500802,
"grad_norm": 0.13261827014959274,
"learning_rate": 1.9919044449842285e-05,
"loss": 0.6021,
"step": 347
},
{
"epoch": 0.17204301075268819,
"grad_norm": 0.1330121909330407,
"learning_rate": 1.9918549226050305e-05,
"loss": 0.5569,
"step": 348
},
{
"epoch": 0.17253738722036832,
"grad_norm": 0.14122017002731904,
"learning_rate": 1.991805249836818e-05,
"loss": 0.5286,
"step": 349
},
{
"epoch": 0.17303176368804846,
"grad_norm": 0.1368165836281056,
"learning_rate": 1.9917554266871223e-05,
"loss": 0.5528,
"step": 350
},
{
"epoch": 0.1735261401557286,
"grad_norm": 0.13369628298347982,
"learning_rate": 1.991705453163498e-05,
"loss": 0.5365,
"step": 351
},
{
"epoch": 0.17402051662340873,
"grad_norm": 0.1317031184471453,
"learning_rate": 1.991655329273522e-05,
"loss": 0.5474,
"step": 352
},
{
"epoch": 0.17451489309108886,
"grad_norm": 0.12252943773014198,
"learning_rate": 1.9916050550247948e-05,
"loss": 0.5166,
"step": 353
},
{
"epoch": 0.175009269558769,
"grad_norm": 0.1553280848734711,
"learning_rate": 1.9915546304249385e-05,
"loss": 0.5747,
"step": 354
},
{
"epoch": 0.17550364602644913,
"grad_norm": 0.1196199553211048,
"learning_rate": 1.9915040554815994e-05,
"loss": 0.5547,
"step": 355
},
{
"epoch": 0.17599802249412927,
"grad_norm": 0.1267493274585488,
"learning_rate": 1.9914533302024452e-05,
"loss": 0.5442,
"step": 356
},
{
"epoch": 0.1764923989618094,
"grad_norm": 0.12967669382101732,
"learning_rate": 1.9914024545951673e-05,
"loss": 0.5194,
"step": 357
},
{
"epoch": 0.17698677542948957,
"grad_norm": 0.1357178129400203,
"learning_rate": 1.99135142866748e-05,
"loss": 0.5412,
"step": 358
},
{
"epoch": 0.1774811518971697,
"grad_norm": 0.15921125206124473,
"learning_rate": 1.9913002524271198e-05,
"loss": 0.5472,
"step": 359
},
{
"epoch": 0.17797552836484984,
"grad_norm": 0.14621383794833864,
"learning_rate": 1.9912489258818462e-05,
"loss": 0.5176,
"step": 360
},
{
"epoch": 0.17846990483252997,
"grad_norm": 0.14457145345963715,
"learning_rate": 1.9911974490394415e-05,
"loss": 0.5735,
"step": 361
},
{
"epoch": 0.1789642813002101,
"grad_norm": 0.13766881527141944,
"learning_rate": 1.991145821907711e-05,
"loss": 0.558,
"step": 362
},
{
"epoch": 0.17945865776789025,
"grad_norm": 0.12864654841289427,
"learning_rate": 1.9910940444944824e-05,
"loss": 0.5456,
"step": 363
},
{
"epoch": 0.17995303423557038,
"grad_norm": 0.14371887560049537,
"learning_rate": 1.9910421168076066e-05,
"loss": 0.5426,
"step": 364
},
{
"epoch": 0.18044741070325052,
"grad_norm": 0.1341989667376047,
"learning_rate": 1.990990038854957e-05,
"loss": 0.5251,
"step": 365
},
{
"epoch": 0.18094178717093065,
"grad_norm": 0.13052219634048062,
"learning_rate": 1.9909378106444295e-05,
"loss": 0.5061,
"step": 366
},
{
"epoch": 0.18143616363861081,
"grad_norm": 0.1311755577112124,
"learning_rate": 1.990885432183944e-05,
"loss": 0.5405,
"step": 367
},
{
"epoch": 0.18193054010629095,
"grad_norm": 0.12789106041851525,
"learning_rate": 1.9908329034814416e-05,
"loss": 0.5464,
"step": 368
},
{
"epoch": 0.18242491657397109,
"grad_norm": 0.12940262261973695,
"learning_rate": 1.9907802245448876e-05,
"loss": 0.5263,
"step": 369
},
{
"epoch": 0.18291929304165122,
"grad_norm": 0.1361752106908491,
"learning_rate": 1.9907273953822685e-05,
"loss": 0.564,
"step": 370
},
{
"epoch": 0.18341366950933136,
"grad_norm": 0.1286312774210412,
"learning_rate": 1.990674416001595e-05,
"loss": 0.562,
"step": 371
},
{
"epoch": 0.1839080459770115,
"grad_norm": 0.12058855544585058,
"learning_rate": 1.9906212864109e-05,
"loss": 0.5228,
"step": 372
},
{
"epoch": 0.18440242244469163,
"grad_norm": 0.12674518769250442,
"learning_rate": 1.990568006618239e-05,
"loss": 0.542,
"step": 373
},
{
"epoch": 0.18489679891237176,
"grad_norm": 0.1297847380999584,
"learning_rate": 1.990514576631691e-05,
"loss": 0.5345,
"step": 374
},
{
"epoch": 0.1853911753800519,
"grad_norm": 0.13363533357833923,
"learning_rate": 1.990460996459357e-05,
"loss": 0.5691,
"step": 375
},
{
"epoch": 0.18588555184773206,
"grad_norm": 0.13368485214159626,
"learning_rate": 1.9904072661093608e-05,
"loss": 0.5656,
"step": 376
},
{
"epoch": 0.1863799283154122,
"grad_norm": 0.12397239994774494,
"learning_rate": 1.9903533855898493e-05,
"loss": 0.5517,
"step": 377
},
{
"epoch": 0.18687430478309233,
"grad_norm": 0.130584577035665,
"learning_rate": 1.9902993549089924e-05,
"loss": 0.5455,
"step": 378
},
{
"epoch": 0.18736868125077247,
"grad_norm": 0.1265476486699415,
"learning_rate": 1.990245174074982e-05,
"loss": 0.5744,
"step": 379
},
{
"epoch": 0.1878630577184526,
"grad_norm": 0.1243721773945244,
"learning_rate": 1.9901908430960337e-05,
"loss": 0.5452,
"step": 380
},
{
"epoch": 0.18835743418613274,
"grad_norm": 0.12955921530130773,
"learning_rate": 1.990136361980385e-05,
"loss": 0.5502,
"step": 381
},
{
"epoch": 0.18885181065381287,
"grad_norm": 0.12240339883845076,
"learning_rate": 1.9900817307362965e-05,
"loss": 0.5202,
"step": 382
},
{
"epoch": 0.189346187121493,
"grad_norm": 0.12759814790378057,
"learning_rate": 1.990026949372052e-05,
"loss": 0.5251,
"step": 383
},
{
"epoch": 0.18984056358917314,
"grad_norm": 0.13052714263596754,
"learning_rate": 1.9899720178959576e-05,
"loss": 0.5412,
"step": 384
},
{
"epoch": 0.19033494005685328,
"grad_norm": 0.12339192472819296,
"learning_rate": 1.989916936316342e-05,
"loss": 0.5163,
"step": 385
},
{
"epoch": 0.19082931652453344,
"grad_norm": 0.1272156625133985,
"learning_rate": 1.989861704641557e-05,
"loss": 0.5303,
"step": 386
},
{
"epoch": 0.19132369299221358,
"grad_norm": 0.13932241715475036,
"learning_rate": 1.9898063228799764e-05,
"loss": 0.5618,
"step": 387
},
{
"epoch": 0.19181806945989371,
"grad_norm": 0.12979643745512898,
"learning_rate": 1.9897507910399987e-05,
"loss": 0.5163,
"step": 388
},
{
"epoch": 0.19231244592757385,
"grad_norm": 0.12495469550197454,
"learning_rate": 1.989695109130043e-05,
"loss": 0.5821,
"step": 389
},
{
"epoch": 0.19280682239525399,
"grad_norm": 0.13060464068610667,
"learning_rate": 1.9896392771585523e-05,
"loss": 0.563,
"step": 390
},
{
"epoch": 0.19330119886293412,
"grad_norm": 0.13746509836762652,
"learning_rate": 1.9895832951339916e-05,
"loss": 0.5448,
"step": 391
},
{
"epoch": 0.19379557533061426,
"grad_norm": 0.12268182453239863,
"learning_rate": 1.9895271630648497e-05,
"loss": 0.5186,
"step": 392
},
{
"epoch": 0.1942899517982944,
"grad_norm": 0.12743314881809642,
"learning_rate": 1.9894708809596374e-05,
"loss": 0.5331,
"step": 393
},
{
"epoch": 0.19478432826597453,
"grad_norm": 0.14059750989115757,
"learning_rate": 1.9894144488268883e-05,
"loss": 0.5446,
"step": 394
},
{
"epoch": 0.1952787047336547,
"grad_norm": 0.21663759885031403,
"learning_rate": 1.989357866675159e-05,
"loss": 0.6066,
"step": 395
},
{
"epoch": 0.19577308120133483,
"grad_norm": 0.12752064348455566,
"learning_rate": 1.9893011345130287e-05,
"loss": 0.5091,
"step": 396
},
{
"epoch": 0.19626745766901496,
"grad_norm": 2.718715403362128,
"learning_rate": 1.9892442523490994e-05,
"loss": 0.5927,
"step": 397
},
{
"epoch": 0.1967618341366951,
"grad_norm": 0.14737375573336406,
"learning_rate": 1.9891872201919954e-05,
"loss": 0.5286,
"step": 398
},
{
"epoch": 0.19725621060437523,
"grad_norm": 0.17937028956656123,
"learning_rate": 1.9891300380503646e-05,
"loss": 0.5209,
"step": 399
},
{
"epoch": 0.19775058707205537,
"grad_norm": 0.14341785437338112,
"learning_rate": 1.989072705932877e-05,
"loss": 0.583,
"step": 400
},
{
"epoch": 0.1982449635397355,
"grad_norm": 0.13888986793576097,
"learning_rate": 1.9890152238482255e-05,
"loss": 0.5519,
"step": 401
},
{
"epoch": 0.19873934000741564,
"grad_norm": 0.6171305113264809,
"learning_rate": 1.988957591805126e-05,
"loss": 0.53,
"step": 402
},
{
"epoch": 0.19923371647509577,
"grad_norm": 0.13901738231318397,
"learning_rate": 1.9888998098123166e-05,
"loss": 0.5383,
"step": 403
},
{
"epoch": 0.19972809294277594,
"grad_norm": 0.12741913260052368,
"learning_rate": 1.988841877878559e-05,
"loss": 0.5563,
"step": 404
},
{
"epoch": 0.20022246941045607,
"grad_norm": 0.14482548472123413,
"learning_rate": 1.9887837960126358e-05,
"loss": 0.5266,
"step": 405
},
{
"epoch": 0.2007168458781362,
"grad_norm": 0.1401174234364105,
"learning_rate": 1.988725564223355e-05,
"loss": 0.5481,
"step": 406
},
{
"epoch": 0.20121122234581634,
"grad_norm": 0.1330610498847754,
"learning_rate": 1.9886671825195453e-05,
"loss": 0.5094,
"step": 407
},
{
"epoch": 0.20170559881349648,
"grad_norm": 0.12543103292582955,
"learning_rate": 1.9886086509100585e-05,
"loss": 0.521,
"step": 408
},
{
"epoch": 0.2021999752811766,
"grad_norm": 0.1466096667138712,
"learning_rate": 1.98854996940377e-05,
"loss": 0.5548,
"step": 409
},
{
"epoch": 0.20269435174885675,
"grad_norm": 0.12811879118225605,
"learning_rate": 1.9884911380095772e-05,
"loss": 0.5288,
"step": 410
},
{
"epoch": 0.20318872821653688,
"grad_norm": 0.13668391185387485,
"learning_rate": 1.9884321567364003e-05,
"loss": 0.5246,
"step": 411
},
{
"epoch": 0.20368310468421702,
"grad_norm": 0.1376310561405914,
"learning_rate": 1.9883730255931818e-05,
"loss": 0.5004,
"step": 412
},
{
"epoch": 0.20417748115189718,
"grad_norm": 0.1311294429858692,
"learning_rate": 1.988313744588888e-05,
"loss": 0.5512,
"step": 413
},
{
"epoch": 0.20467185761957732,
"grad_norm": 0.13385471097584614,
"learning_rate": 1.9882543137325073e-05,
"loss": 0.5382,
"step": 414
},
{
"epoch": 0.20516623408725745,
"grad_norm": 0.13384698167774212,
"learning_rate": 1.9881947330330505e-05,
"loss": 0.5603,
"step": 415
},
{
"epoch": 0.2056606105549376,
"grad_norm": 0.4066929382010196,
"learning_rate": 1.9881350024995514e-05,
"loss": 0.5627,
"step": 416
},
{
"epoch": 0.20615498702261773,
"grad_norm": 0.12952489088863267,
"learning_rate": 1.9880751221410672e-05,
"loss": 0.564,
"step": 417
},
{
"epoch": 0.20664936349029786,
"grad_norm": 0.14183951501533587,
"learning_rate": 1.9880150919666764e-05,
"loss": 0.5594,
"step": 418
},
{
"epoch": 0.207143739957978,
"grad_norm": 0.1455422877580284,
"learning_rate": 1.987954911985482e-05,
"loss": 0.5556,
"step": 419
},
{
"epoch": 0.20763811642565813,
"grad_norm": 2.6560569561336558,
"learning_rate": 1.987894582206608e-05,
"loss": 0.5447,
"step": 420
},
{
"epoch": 0.20813249289333827,
"grad_norm": 0.1758698121643467,
"learning_rate": 1.9878341026392016e-05,
"loss": 0.5577,
"step": 421
},
{
"epoch": 0.2086268693610184,
"grad_norm": 0.14792063047071205,
"learning_rate": 1.9877734732924335e-05,
"loss": 0.535,
"step": 422
},
{
"epoch": 0.20912124582869857,
"grad_norm": 0.6779385962533373,
"learning_rate": 1.9877126941754966e-05,
"loss": 0.5839,
"step": 423
},
{
"epoch": 0.2096156222963787,
"grad_norm": 0.19624745910793837,
"learning_rate": 1.987651765297606e-05,
"loss": 0.586,
"step": 424
},
{
"epoch": 0.21010999876405884,
"grad_norm": 0.17755675964860526,
"learning_rate": 1.9875906866680002e-05,
"loss": 0.5899,
"step": 425
},
{
"epoch": 0.21060437523173897,
"grad_norm": 0.18166399403640282,
"learning_rate": 1.9875294582959407e-05,
"loss": 0.5297,
"step": 426
},
{
"epoch": 0.2110987516994191,
"grad_norm": 0.16766163507992954,
"learning_rate": 1.9874680801907108e-05,
"loss": 0.5694,
"step": 427
},
{
"epoch": 0.21159312816709924,
"grad_norm": 0.16892617463610174,
"learning_rate": 1.9874065523616165e-05,
"loss": 0.544,
"step": 428
},
{
"epoch": 0.21208750463477938,
"grad_norm": 0.15656847745825397,
"learning_rate": 1.9873448748179872e-05,
"loss": 0.5535,
"step": 429
},
{
"epoch": 0.2125818811024595,
"grad_norm": 0.14871937677373576,
"learning_rate": 1.9872830475691747e-05,
"loss": 0.5648,
"step": 430
},
{
"epoch": 0.21307625757013965,
"grad_norm": 0.17316121961644246,
"learning_rate": 1.9872210706245538e-05,
"loss": 0.5779,
"step": 431
},
{
"epoch": 0.2135706340378198,
"grad_norm": 0.15139853154204733,
"learning_rate": 1.9871589439935212e-05,
"loss": 0.5566,
"step": 432
},
{
"epoch": 0.21406501050549995,
"grad_norm": 0.1639058630359748,
"learning_rate": 1.9870966676854972e-05,
"loss": 0.5329,
"step": 433
},
{
"epoch": 0.21455938697318008,
"grad_norm": 0.13377695378526672,
"learning_rate": 1.9870342417099244e-05,
"loss": 0.5552,
"step": 434
},
{
"epoch": 0.21505376344086022,
"grad_norm": 0.21216018869616357,
"learning_rate": 1.986971666076267e-05,
"loss": 0.5587,
"step": 435
},
{
"epoch": 0.21554813990854035,
"grad_norm": 0.16829644486146256,
"learning_rate": 1.9869089407940147e-05,
"loss": 0.5441,
"step": 436
},
{
"epoch": 0.2160425163762205,
"grad_norm": 0.1758464629538605,
"learning_rate": 1.986846065872677e-05,
"loss": 0.5546,
"step": 437
},
{
"epoch": 0.21653689284390062,
"grad_norm": 0.1350323359517654,
"learning_rate": 1.9867830413217876e-05,
"loss": 0.5628,
"step": 438
},
{
"epoch": 0.21703126931158076,
"grad_norm": 0.14928422048160467,
"learning_rate": 1.986719867150902e-05,
"loss": 0.5418,
"step": 439
},
{
"epoch": 0.2175256457792609,
"grad_norm": 0.1294827610241255,
"learning_rate": 1.9866565433696002e-05,
"loss": 0.5382,
"step": 440
},
{
"epoch": 0.21802002224694106,
"grad_norm": 0.1441935597488807,
"learning_rate": 1.9865930699874824e-05,
"loss": 0.5268,
"step": 441
},
{
"epoch": 0.2185143987146212,
"grad_norm": 0.5818197478594357,
"learning_rate": 1.9865294470141732e-05,
"loss": 0.5375,
"step": 442
},
{
"epoch": 0.21900877518230133,
"grad_norm": 0.1363679553146533,
"learning_rate": 1.9864656744593192e-05,
"loss": 0.5369,
"step": 443
},
{
"epoch": 0.21950315164998146,
"grad_norm": 0.16512756436513895,
"learning_rate": 1.9864017523325898e-05,
"loss": 0.5506,
"step": 444
},
{
"epoch": 0.2199975281176616,
"grad_norm": 0.1351752757444937,
"learning_rate": 1.9863376806436774e-05,
"loss": 0.5631,
"step": 445
},
{
"epoch": 0.22049190458534174,
"grad_norm": 0.1718589037770868,
"learning_rate": 1.9862734594022964e-05,
"loss": 0.5637,
"step": 446
},
{
"epoch": 0.22098628105302187,
"grad_norm": 0.12464924074289992,
"learning_rate": 1.9862090886181845e-05,
"loss": 0.5375,
"step": 447
},
{
"epoch": 0.221480657520702,
"grad_norm": 0.150889883214495,
"learning_rate": 1.9861445683011023e-05,
"loss": 0.5425,
"step": 448
},
{
"epoch": 0.22197503398838214,
"grad_norm": 0.11992913028193727,
"learning_rate": 1.986079898460832e-05,
"loss": 0.5217,
"step": 449
},
{
"epoch": 0.22246941045606228,
"grad_norm": 0.15128995694430558,
"learning_rate": 1.9860150791071794e-05,
"loss": 0.5238,
"step": 450
},
{
"epoch": 0.22296378692374244,
"grad_norm": 0.13512249855393857,
"learning_rate": 1.9859501102499722e-05,
"loss": 0.5111,
"step": 451
},
{
"epoch": 0.22345816339142258,
"grad_norm": 0.13519721013877448,
"learning_rate": 1.985884991899062e-05,
"loss": 0.5143,
"step": 452
},
{
"epoch": 0.2239525398591027,
"grad_norm": 0.13050962503060154,
"learning_rate": 1.985819724064322e-05,
"loss": 0.5385,
"step": 453
},
{
"epoch": 0.22444691632678285,
"grad_norm": 0.15203304265348894,
"learning_rate": 1.9857543067556483e-05,
"loss": 0.5273,
"step": 454
},
{
"epoch": 0.22494129279446298,
"grad_norm": 0.11837749993082794,
"learning_rate": 1.9856887399829594e-05,
"loss": 0.536,
"step": 455
},
{
"epoch": 0.22543566926214312,
"grad_norm": 0.14629035019578973,
"learning_rate": 1.9856230237561974e-05,
"loss": 0.5681,
"step": 456
},
{
"epoch": 0.22593004572982325,
"grad_norm": 0.1221862401370936,
"learning_rate": 1.9855571580853258e-05,
"loss": 0.5341,
"step": 457
},
{
"epoch": 0.2264244221975034,
"grad_norm": 0.1361414966272578,
"learning_rate": 1.9854911429803324e-05,
"loss": 0.5515,
"step": 458
},
{
"epoch": 0.22691879866518352,
"grad_norm": 0.12827681916529327,
"learning_rate": 1.9854249784512257e-05,
"loss": 0.5802,
"step": 459
},
{
"epoch": 0.2274131751328637,
"grad_norm": 0.13261042316001787,
"learning_rate": 1.985358664508038e-05,
"loss": 0.5126,
"step": 460
},
{
"epoch": 0.22790755160054382,
"grad_norm": 0.12955565009065853,
"learning_rate": 1.9852922011608245e-05,
"loss": 0.5063,
"step": 461
},
{
"epoch": 0.22840192806822396,
"grad_norm": 0.12873772291490365,
"learning_rate": 1.9852255884196626e-05,
"loss": 0.5277,
"step": 462
},
{
"epoch": 0.2288963045359041,
"grad_norm": 0.12847049795542467,
"learning_rate": 1.985158826294652e-05,
"loss": 0.5501,
"step": 463
},
{
"epoch": 0.22939068100358423,
"grad_norm": 0.12264272185484301,
"learning_rate": 1.9850919147959158e-05,
"loss": 0.551,
"step": 464
},
{
"epoch": 0.22988505747126436,
"grad_norm": 0.13375410473652186,
"learning_rate": 1.985024853933599e-05,
"loss": 0.5726,
"step": 465
},
{
"epoch": 0.2303794339389445,
"grad_norm": 0.1239707256082663,
"learning_rate": 1.9849576437178703e-05,
"loss": 0.5334,
"step": 466
},
{
"epoch": 0.23087381040662464,
"grad_norm": 0.13316816600581774,
"learning_rate": 1.98489028415892e-05,
"loss": 0.5628,
"step": 467
},
{
"epoch": 0.23136818687430477,
"grad_norm": 0.12921458163821353,
"learning_rate": 1.9848227752669612e-05,
"loss": 0.5367,
"step": 468
},
{
"epoch": 0.23186256334198493,
"grad_norm": 0.3340283789335476,
"learning_rate": 1.98475511705223e-05,
"loss": 0.5793,
"step": 469
},
{
"epoch": 0.23235693980966507,
"grad_norm": 0.12796246880785253,
"learning_rate": 1.984687309524985e-05,
"loss": 0.5215,
"step": 470
},
{
"epoch": 0.2328513162773452,
"grad_norm": 0.12580866602435614,
"learning_rate": 1.9846193526955074e-05,
"loss": 0.5291,
"step": 471
},
{
"epoch": 0.23334569274502534,
"grad_norm": 0.1433732128545011,
"learning_rate": 1.9845512465741016e-05,
"loss": 0.5366,
"step": 472
},
{
"epoch": 0.23384006921270548,
"grad_norm": 0.12476421035236517,
"learning_rate": 1.9844829911710936e-05,
"loss": 0.5411,
"step": 473
},
{
"epoch": 0.2343344456803856,
"grad_norm": 0.3164456402293468,
"learning_rate": 1.9844145864968326e-05,
"loss": 0.549,
"step": 474
},
{
"epoch": 0.23482882214806575,
"grad_norm": 0.13019893195373083,
"learning_rate": 1.9843460325616908e-05,
"loss": 0.5566,
"step": 475
},
{
"epoch": 0.23532319861574588,
"grad_norm": 0.12244920773720974,
"learning_rate": 1.984277329376062e-05,
"loss": 0.5414,
"step": 476
},
{
"epoch": 0.23581757508342602,
"grad_norm": 0.13423520084666352,
"learning_rate": 1.9842084769503636e-05,
"loss": 0.5518,
"step": 477
},
{
"epoch": 0.23631195155110618,
"grad_norm": 0.1506427218897423,
"learning_rate": 1.984139475295035e-05,
"loss": 0.5493,
"step": 478
},
{
"epoch": 0.23680632801878632,
"grad_norm": 0.1372190950222617,
"learning_rate": 1.9840703244205392e-05,
"loss": 0.5142,
"step": 479
},
{
"epoch": 0.23730070448646645,
"grad_norm": 0.13209809741629291,
"learning_rate": 1.9840010243373603e-05,
"loss": 0.5761,
"step": 480
},
{
"epoch": 0.2377950809541466,
"grad_norm": 0.1419259852826226,
"learning_rate": 1.9839315750560068e-05,
"loss": 0.5432,
"step": 481
},
{
"epoch": 0.23828945742182672,
"grad_norm": 0.13231490600644383,
"learning_rate": 1.9838619765870076e-05,
"loss": 0.5487,
"step": 482
},
{
"epoch": 0.23878383388950686,
"grad_norm": 0.18475264408773226,
"learning_rate": 1.9837922289409164e-05,
"loss": 0.5552,
"step": 483
},
{
"epoch": 0.239278210357187,
"grad_norm": 0.13598980499797578,
"learning_rate": 1.9837223321283087e-05,
"loss": 0.52,
"step": 484
},
{
"epoch": 0.23977258682486713,
"grad_norm": 0.172720123297476,
"learning_rate": 1.983652286159782e-05,
"loss": 0.5273,
"step": 485
},
{
"epoch": 0.24026696329254726,
"grad_norm": 0.13162936645847328,
"learning_rate": 1.9835820910459573e-05,
"loss": 0.516,
"step": 486
},
{
"epoch": 0.2407613397602274,
"grad_norm": 0.13110961058769308,
"learning_rate": 1.983511746797478e-05,
"loss": 0.535,
"step": 487
},
{
"epoch": 0.24125571622790756,
"grad_norm": 0.13250809770854105,
"learning_rate": 1.983441253425009e-05,
"loss": 0.5406,
"step": 488
},
{
"epoch": 0.2417500926955877,
"grad_norm": 0.1252388012432334,
"learning_rate": 1.9833706109392404e-05,
"loss": 0.5134,
"step": 489
},
{
"epoch": 0.24224446916326783,
"grad_norm": 0.1338195341642124,
"learning_rate": 1.983299819350882e-05,
"loss": 0.5605,
"step": 490
},
{
"epoch": 0.24273884563094797,
"grad_norm": 0.1363904239819599,
"learning_rate": 1.983228878670668e-05,
"loss": 0.522,
"step": 491
},
{
"epoch": 0.2432332220986281,
"grad_norm": 0.12686259114961576,
"learning_rate": 1.9831577889093546e-05,
"loss": 0.5407,
"step": 492
},
{
"epoch": 0.24372759856630824,
"grad_norm": 0.13835858429131442,
"learning_rate": 1.983086550077721e-05,
"loss": 0.5526,
"step": 493
},
{
"epoch": 0.24422197503398838,
"grad_norm": 0.13311365555302848,
"learning_rate": 1.9830151621865682e-05,
"loss": 0.5321,
"step": 494
},
{
"epoch": 0.2447163515016685,
"grad_norm": 0.1384780600165611,
"learning_rate": 1.9829436252467208e-05,
"loss": 0.5538,
"step": 495
},
{
"epoch": 0.24521072796934865,
"grad_norm": 0.1314849381215902,
"learning_rate": 1.9828719392690252e-05,
"loss": 0.5107,
"step": 496
},
{
"epoch": 0.2457051044370288,
"grad_norm": 0.12436189670791688,
"learning_rate": 1.982800104264351e-05,
"loss": 0.5289,
"step": 497
},
{
"epoch": 0.24619948090470894,
"grad_norm": 0.12829278087890425,
"learning_rate": 1.9827281202435898e-05,
"loss": 0.5385,
"step": 498
},
{
"epoch": 0.24669385737238908,
"grad_norm": 0.1272565687678894,
"learning_rate": 1.982655987217656e-05,
"loss": 0.5378,
"step": 499
},
{
"epoch": 0.24718823384006922,
"grad_norm": 0.13870464446728384,
"learning_rate": 1.9825837051974874e-05,
"loss": 0.5403,
"step": 500
},
{
"epoch": 0.24768261030774935,
"grad_norm": 0.12932462205946474,
"learning_rate": 1.982511274194043e-05,
"loss": 0.5107,
"step": 501
},
{
"epoch": 0.2481769867754295,
"grad_norm": 0.1473818796661029,
"learning_rate": 1.9824386942183053e-05,
"loss": 0.502,
"step": 502
},
{
"epoch": 0.24867136324310962,
"grad_norm": 0.13988090405773956,
"learning_rate": 1.982365965281279e-05,
"loss": 0.5366,
"step": 503
},
{
"epoch": 0.24916573971078976,
"grad_norm": 0.150839464082402,
"learning_rate": 1.9822930873939923e-05,
"loss": 0.5294,
"step": 504
},
{
"epoch": 0.2496601161784699,
"grad_norm": 0.1209151399130438,
"learning_rate": 1.9822200605674942e-05,
"loss": 0.542,
"step": 505
},
{
"epoch": 0.25015449264615003,
"grad_norm": 0.1427392938594922,
"learning_rate": 1.982146884812858e-05,
"loss": 0.5452,
"step": 506
},
{
"epoch": 0.25015449264615003,
"eval_loss": 0.5370768308639526,
"eval_runtime": 100.9295,
"eval_samples_per_second": 300.745,
"eval_steps_per_second": 37.601,
"step": 506
},
{
"epoch": 0.25064886911383016,
"grad_norm": 0.1317955319600235,
"learning_rate": 1.9820735601411787e-05,
"loss": 0.5144,
"step": 507
},
{
"epoch": 0.2511432455815103,
"grad_norm": 0.14233108971422181,
"learning_rate": 1.982000086563574e-05,
"loss": 0.5336,
"step": 508
},
{
"epoch": 0.25163762204919043,
"grad_norm": 0.12647980741035628,
"learning_rate": 1.981926464091184e-05,
"loss": 0.5024,
"step": 509
},
{
"epoch": 0.25213199851687057,
"grad_norm": 0.1695041019255031,
"learning_rate": 1.9818526927351723e-05,
"loss": 0.5363,
"step": 510
},
{
"epoch": 0.25262637498455076,
"grad_norm": 0.1186146464165455,
"learning_rate": 1.981778772506724e-05,
"loss": 0.5411,
"step": 511
},
{
"epoch": 0.2531207514522309,
"grad_norm": 0.14562388109571825,
"learning_rate": 1.9817047034170477e-05,
"loss": 0.5495,
"step": 512
},
{
"epoch": 0.25361512791991103,
"grad_norm": 0.12461607805724693,
"learning_rate": 1.981630485477373e-05,
"loss": 0.5287,
"step": 513
},
{
"epoch": 0.25410950438759117,
"grad_norm": 0.17248721956698204,
"learning_rate": 1.9815561186989537e-05,
"loss": 0.5567,
"step": 514
},
{
"epoch": 0.2546038808552713,
"grad_norm": 0.13382060035640533,
"learning_rate": 1.981481603093066e-05,
"loss": 0.5101,
"step": 515
},
{
"epoch": 0.25509825732295144,
"grad_norm": 0.25996642277455645,
"learning_rate": 1.9814069386710076e-05,
"loss": 0.5526,
"step": 516
},
{
"epoch": 0.2555926337906316,
"grad_norm": 0.13915640251166403,
"learning_rate": 1.9813321254441e-05,
"loss": 0.5284,
"step": 517
},
{
"epoch": 0.2560870102583117,
"grad_norm": 0.12885801373338757,
"learning_rate": 1.9812571634236863e-05,
"loss": 0.5498,
"step": 518
},
{
"epoch": 0.25658138672599184,
"grad_norm": 1.1382998248815246,
"learning_rate": 1.981182052621132e-05,
"loss": 0.7078,
"step": 519
},
{
"epoch": 0.257075763193672,
"grad_norm": 0.15239039379678126,
"learning_rate": 1.9811067930478266e-05,
"loss": 0.5211,
"step": 520
},
{
"epoch": 0.2575701396613521,
"grad_norm": 0.13183882110688952,
"learning_rate": 1.9810313847151814e-05,
"loss": 0.5336,
"step": 521
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.1420746287265802,
"learning_rate": 1.9809558276346294e-05,
"loss": 0.5333,
"step": 522
},
{
"epoch": 0.2585588925967124,
"grad_norm": 0.13465407498257184,
"learning_rate": 1.9808801218176272e-05,
"loss": 0.5142,
"step": 523
},
{
"epoch": 0.2590532690643925,
"grad_norm": 1.5719562612565656,
"learning_rate": 1.9808042672756534e-05,
"loss": 0.5439,
"step": 524
},
{
"epoch": 0.25954764553207266,
"grad_norm": 0.14070797348977954,
"learning_rate": 1.9807282640202098e-05,
"loss": 0.5617,
"step": 525
},
{
"epoch": 0.2600420219997528,
"grad_norm": 0.15181751701619653,
"learning_rate": 1.9806521120628196e-05,
"loss": 0.5763,
"step": 526
},
{
"epoch": 0.26053639846743293,
"grad_norm": 0.1375371240157062,
"learning_rate": 1.9805758114150305e-05,
"loss": 0.5535,
"step": 527
},
{
"epoch": 0.26103077493511306,
"grad_norm": 0.18656161407383393,
"learning_rate": 1.98049936208841e-05,
"loss": 0.5354,
"step": 528
},
{
"epoch": 0.26152515140279325,
"grad_norm": 1.1654352333624456,
"learning_rate": 1.9804227640945508e-05,
"loss": 0.5709,
"step": 529
},
{
"epoch": 0.2620195278704734,
"grad_norm": 0.13760610104406837,
"learning_rate": 1.9803460174450662e-05,
"loss": 0.5202,
"step": 530
},
{
"epoch": 0.2625139043381535,
"grad_norm": 0.1680041731388263,
"learning_rate": 1.9802691221515936e-05,
"loss": 0.5673,
"step": 531
},
{
"epoch": 0.26300828080583366,
"grad_norm": 0.14575269519664613,
"learning_rate": 1.9801920782257914e-05,
"loss": 0.5182,
"step": 532
},
{
"epoch": 0.2635026572735138,
"grad_norm": 0.2008344214220722,
"learning_rate": 1.980114885679342e-05,
"loss": 0.5443,
"step": 533
},
{
"epoch": 0.26399703374119393,
"grad_norm": 0.1586187598174719,
"learning_rate": 1.9800375445239493e-05,
"loss": 0.5354,
"step": 534
},
{
"epoch": 0.26449141020887407,
"grad_norm": 0.13627973256007056,
"learning_rate": 1.97996005477134e-05,
"loss": 0.5374,
"step": 535
},
{
"epoch": 0.2649857866765542,
"grad_norm": 0.15106037102016545,
"learning_rate": 1.9798824164332635e-05,
"loss": 0.5115,
"step": 536
},
{
"epoch": 0.26548016314423434,
"grad_norm": 0.14141924536831174,
"learning_rate": 1.9798046295214918e-05,
"loss": 0.5313,
"step": 537
},
{
"epoch": 0.2659745396119145,
"grad_norm": 0.151920321677155,
"learning_rate": 1.979726694047819e-05,
"loss": 0.5577,
"step": 538
},
{
"epoch": 0.2664689160795946,
"grad_norm": 0.15890436943117506,
"learning_rate": 1.979648610024062e-05,
"loss": 0.5459,
"step": 539
},
{
"epoch": 0.26696329254727474,
"grad_norm": 0.14574404761405327,
"learning_rate": 1.9795703774620608e-05,
"loss": 0.5582,
"step": 540
},
{
"epoch": 0.2674576690149549,
"grad_norm": 0.15798070977868273,
"learning_rate": 1.979491996373676e-05,
"loss": 0.5507,
"step": 541
},
{
"epoch": 0.267952045482635,
"grad_norm": 0.12425866689756139,
"learning_rate": 1.9794134667707938e-05,
"loss": 0.5427,
"step": 542
},
{
"epoch": 0.26844642195031515,
"grad_norm": 0.14303864210609338,
"learning_rate": 1.97933478866532e-05,
"loss": 0.5377,
"step": 543
},
{
"epoch": 0.2689407984179953,
"grad_norm": 0.12928985159205597,
"learning_rate": 1.979255962069184e-05,
"loss": 0.5273,
"step": 544
},
{
"epoch": 0.2694351748856754,
"grad_norm": 0.15545707830222702,
"learning_rate": 1.9791769869943384e-05,
"loss": 0.5427,
"step": 545
},
{
"epoch": 0.26992955135335556,
"grad_norm": 0.1288448868832813,
"learning_rate": 1.9790978634527577e-05,
"loss": 0.5498,
"step": 546
},
{
"epoch": 0.2704239278210357,
"grad_norm": 0.13478501919451955,
"learning_rate": 1.9790185914564385e-05,
"loss": 0.5229,
"step": 547
},
{
"epoch": 0.2709183042887159,
"grad_norm": 0.53874843912914,
"learning_rate": 1.9789391710174005e-05,
"loss": 0.5313,
"step": 548
},
{
"epoch": 0.271412680756396,
"grad_norm": 0.13773726017464047,
"learning_rate": 1.978859602147686e-05,
"loss": 0.533,
"step": 549
},
{
"epoch": 0.27190705722407615,
"grad_norm": 0.13495046357761317,
"learning_rate": 1.978779884859359e-05,
"loss": 0.5347,
"step": 550
},
{
"epoch": 0.2724014336917563,
"grad_norm": 0.224580525112084,
"learning_rate": 1.9787000191645072e-05,
"loss": 0.5534,
"step": 551
},
{
"epoch": 0.2728958101594364,
"grad_norm": 0.14273141878562484,
"learning_rate": 1.97862000507524e-05,
"loss": 0.5634,
"step": 552
},
{
"epoch": 0.27339018662711656,
"grad_norm": 0.14022804453621376,
"learning_rate": 1.9785398426036888e-05,
"loss": 0.5503,
"step": 553
},
{
"epoch": 0.2738845630947967,
"grad_norm": 0.16168406221001033,
"learning_rate": 1.9784595317620093e-05,
"loss": 0.5423,
"step": 554
},
{
"epoch": 0.27437893956247683,
"grad_norm": 0.14142475230497964,
"learning_rate": 1.9783790725623776e-05,
"loss": 0.5438,
"step": 555
},
{
"epoch": 0.27487331603015697,
"grad_norm": 0.15471063851796377,
"learning_rate": 1.9782984650169934e-05,
"loss": 0.5562,
"step": 556
},
{
"epoch": 0.2753676924978371,
"grad_norm": 0.15567512542379844,
"learning_rate": 1.978217709138079e-05,
"loss": 0.5638,
"step": 557
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.14097216981636962,
"learning_rate": 1.9781368049378788e-05,
"loss": 0.5391,
"step": 558
},
{
"epoch": 0.2763564454331974,
"grad_norm": 0.24469762403434656,
"learning_rate": 1.9780557524286602e-05,
"loss": 0.5424,
"step": 559
},
{
"epoch": 0.2768508219008775,
"grad_norm": 0.14442998111187433,
"learning_rate": 1.977974551622712e-05,
"loss": 0.5147,
"step": 560
},
{
"epoch": 0.27734519836855764,
"grad_norm": 1.167199094994208,
"learning_rate": 1.977893202532347e-05,
"loss": 0.5545,
"step": 561
},
{
"epoch": 0.2778395748362378,
"grad_norm": 0.13405208963838194,
"learning_rate": 1.9778117051698987e-05,
"loss": 0.5332,
"step": 562
},
{
"epoch": 0.2783339513039179,
"grad_norm": 0.1285716123147683,
"learning_rate": 1.9777300595477248e-05,
"loss": 0.5346,
"step": 563
},
{
"epoch": 0.27882832777159805,
"grad_norm": 0.13303402939071393,
"learning_rate": 1.9776482656782043e-05,
"loss": 0.5109,
"step": 564
},
{
"epoch": 0.2793227042392782,
"grad_norm": 0.24218050020176693,
"learning_rate": 1.9775663235737397e-05,
"loss": 0.5477,
"step": 565
},
{
"epoch": 0.2798170807069584,
"grad_norm": 0.12125093882853527,
"learning_rate": 1.977484233246755e-05,
"loss": 0.5183,
"step": 566
},
{
"epoch": 0.2803114571746385,
"grad_norm": 1.698953231733813,
"learning_rate": 1.977401994709697e-05,
"loss": 0.6325,
"step": 567
},
{
"epoch": 0.28080583364231865,
"grad_norm": 0.14533327543066626,
"learning_rate": 1.977319607975035e-05,
"loss": 0.587,
"step": 568
},
{
"epoch": 0.2813002101099988,
"grad_norm": 0.20106639368983653,
"learning_rate": 1.977237073055261e-05,
"loss": 0.5219,
"step": 569
},
{
"epoch": 0.2817945865776789,
"grad_norm": 0.28844227906726033,
"learning_rate": 1.9771543899628892e-05,
"loss": 0.5764,
"step": 570
},
{
"epoch": 0.28228896304535905,
"grad_norm": 0.14266485587044883,
"learning_rate": 1.9770715587104565e-05,
"loss": 0.5223,
"step": 571
},
{
"epoch": 0.2827833395130392,
"grad_norm": 0.17938569097318702,
"learning_rate": 1.9769885793105217e-05,
"loss": 0.5403,
"step": 572
},
{
"epoch": 0.2832777159807193,
"grad_norm": 0.16027566502607776,
"learning_rate": 1.9769054517756666e-05,
"loss": 0.5287,
"step": 573
},
{
"epoch": 0.28377209244839946,
"grad_norm": 0.14392076831995626,
"learning_rate": 1.9768221761184958e-05,
"loss": 0.5289,
"step": 574
},
{
"epoch": 0.2842664689160796,
"grad_norm": 0.14172303046115992,
"learning_rate": 1.9767387523516354e-05,
"loss": 0.5526,
"step": 575
},
{
"epoch": 0.28476084538375973,
"grad_norm": 0.14039437028135332,
"learning_rate": 1.9766551804877348e-05,
"loss": 0.5548,
"step": 576
},
{
"epoch": 0.28525522185143987,
"grad_norm": 0.14751897843558465,
"learning_rate": 1.9765714605394652e-05,
"loss": 0.5228,
"step": 577
},
{
"epoch": 0.28574959831912,
"grad_norm": 0.1367790371855184,
"learning_rate": 1.9764875925195202e-05,
"loss": 0.5242,
"step": 578
},
{
"epoch": 0.28624397478680014,
"grad_norm": 0.14346352480669264,
"learning_rate": 1.9764035764406172e-05,
"loss": 0.5217,
"step": 579
},
{
"epoch": 0.2867383512544803,
"grad_norm": 0.1290861984016415,
"learning_rate": 1.9763194123154946e-05,
"loss": 0.5327,
"step": 580
},
{
"epoch": 0.2872327277221604,
"grad_norm": 0.1353871399303147,
"learning_rate": 1.976235100156913e-05,
"loss": 0.5368,
"step": 581
},
{
"epoch": 0.28772710418984054,
"grad_norm": 0.133940265265466,
"learning_rate": 1.9761506399776573e-05,
"loss": 0.552,
"step": 582
},
{
"epoch": 0.2882214806575207,
"grad_norm": 0.1430623871790493,
"learning_rate": 1.976066031790533e-05,
"loss": 0.5506,
"step": 583
},
{
"epoch": 0.2887158571252008,
"grad_norm": 0.1391758280067808,
"learning_rate": 1.975981275608369e-05,
"loss": 0.5736,
"step": 584
},
{
"epoch": 0.289210233592881,
"grad_norm": 0.14583778058739796,
"learning_rate": 1.975896371444016e-05,
"loss": 0.5351,
"step": 585
},
{
"epoch": 0.28970461006056114,
"grad_norm": 0.5020961287990244,
"learning_rate": 1.9758113193103473e-05,
"loss": 0.5388,
"step": 586
},
{
"epoch": 0.2901989865282413,
"grad_norm": 0.14764553233090033,
"learning_rate": 1.97572611922026e-05,
"loss": 0.5249,
"step": 587
},
{
"epoch": 0.2906933629959214,
"grad_norm": 0.13174767428261996,
"learning_rate": 1.9756407711866715e-05,
"loss": 0.5536,
"step": 588
},
{
"epoch": 0.29118773946360155,
"grad_norm": 0.13566457841449767,
"learning_rate": 1.975555275222523e-05,
"loss": 0.5128,
"step": 589
},
{
"epoch": 0.2916821159312817,
"grad_norm": 0.5711932669290958,
"learning_rate": 1.9754696313407776e-05,
"loss": 0.5498,
"step": 590
},
{
"epoch": 0.2921764923989618,
"grad_norm": 0.12606525033108432,
"learning_rate": 1.9753838395544208e-05,
"loss": 0.5081,
"step": 591
},
{
"epoch": 0.29267086886664195,
"grad_norm": 0.1461049665245137,
"learning_rate": 1.975297899876461e-05,
"loss": 0.5043,
"step": 592
},
{
"epoch": 0.2931652453343221,
"grad_norm": 0.12833531594193462,
"learning_rate": 1.9752118123199285e-05,
"loss": 0.5713,
"step": 593
},
{
"epoch": 0.2936596218020022,
"grad_norm": 0.13739807143836225,
"learning_rate": 1.9751255768978765e-05,
"loss": 0.5317,
"step": 594
},
{
"epoch": 0.29415399826968236,
"grad_norm": 0.12100895649616715,
"learning_rate": 1.9750391936233802e-05,
"loss": 0.549,
"step": 595
},
{
"epoch": 0.2946483747373625,
"grad_norm": 0.1202775459772075,
"learning_rate": 1.9749526625095376e-05,
"loss": 0.5154,
"step": 596
},
{
"epoch": 0.29514275120504263,
"grad_norm": 0.12633929578059877,
"learning_rate": 1.9748659835694687e-05,
"loss": 0.5114,
"step": 597
},
{
"epoch": 0.29563712767272277,
"grad_norm": 0.12925544824013205,
"learning_rate": 1.9747791568163158e-05,
"loss": 0.5561,
"step": 598
},
{
"epoch": 0.2961315041404029,
"grad_norm": 0.2300251711418817,
"learning_rate": 1.9746921822632442e-05,
"loss": 0.6106,
"step": 599
},
{
"epoch": 0.29662588060808304,
"grad_norm": 0.13363136943565565,
"learning_rate": 1.9746050599234414e-05,
"loss": 0.5727,
"step": 600
},
{
"epoch": 0.29712025707576317,
"grad_norm": 0.12722315160175843,
"learning_rate": 1.9745177898101173e-05,
"loss": 0.5548,
"step": 601
},
{
"epoch": 0.2976146335434433,
"grad_norm": 0.42075394802114263,
"learning_rate": 1.974430371936504e-05,
"loss": 0.5545,
"step": 602
},
{
"epoch": 0.29810901001112344,
"grad_norm": 0.128371236113254,
"learning_rate": 1.974342806315856e-05,
"loss": 0.5337,
"step": 603
},
{
"epoch": 0.29860338647880363,
"grad_norm": 0.13301910062710529,
"learning_rate": 1.9742550929614505e-05,
"loss": 0.5324,
"step": 604
},
{
"epoch": 0.29909776294648377,
"grad_norm": 0.1624487851794696,
"learning_rate": 1.974167231886587e-05,
"loss": 0.4966,
"step": 605
},
{
"epoch": 0.2995921394141639,
"grad_norm": 0.12316894068232682,
"learning_rate": 1.9740792231045872e-05,
"loss": 0.5436,
"step": 606
},
{
"epoch": 0.30008651588184404,
"grad_norm": 0.12618490173419122,
"learning_rate": 1.973991066628796e-05,
"loss": 0.5318,
"step": 607
},
{
"epoch": 0.3005808923495242,
"grad_norm": 0.13562555573951643,
"learning_rate": 1.9739027624725788e-05,
"loss": 0.5326,
"step": 608
},
{
"epoch": 0.3010752688172043,
"grad_norm": 0.27093261261941476,
"learning_rate": 1.973814310649326e-05,
"loss": 0.5527,
"step": 609
},
{
"epoch": 0.30156964528488445,
"grad_norm": 0.12354171544700583,
"learning_rate": 1.9737257111724476e-05,
"loss": 0.5175,
"step": 610
},
{
"epoch": 0.3020640217525646,
"grad_norm": 0.14169700224291637,
"learning_rate": 1.9736369640553787e-05,
"loss": 0.5476,
"step": 611
},
{
"epoch": 0.3025583982202447,
"grad_norm": 0.12825178622629602,
"learning_rate": 1.973548069311575e-05,
"loss": 0.5373,
"step": 612
},
{
"epoch": 0.30305277468792485,
"grad_norm": 0.14620720329954326,
"learning_rate": 1.9734590269545147e-05,
"loss": 0.5573,
"step": 613
},
{
"epoch": 0.303547151155605,
"grad_norm": 0.13855697953732984,
"learning_rate": 1.9733698369976993e-05,
"loss": 0.5507,
"step": 614
},
{
"epoch": 0.3040415276232851,
"grad_norm": 0.3144608942465845,
"learning_rate": 1.973280499454652e-05,
"loss": 0.5434,
"step": 615
},
{
"epoch": 0.30453590409096526,
"grad_norm": 0.13204847785973764,
"learning_rate": 1.973191014338918e-05,
"loss": 0.5537,
"step": 616
},
{
"epoch": 0.3050302805586454,
"grad_norm": 0.14782730423416013,
"learning_rate": 1.973101381664066e-05,
"loss": 0.5181,
"step": 617
},
{
"epoch": 0.30552465702632553,
"grad_norm": 0.13273569716037195,
"learning_rate": 1.9730116014436867e-05,
"loss": 0.5566,
"step": 618
},
{
"epoch": 0.30601903349400567,
"grad_norm": 0.220335733066492,
"learning_rate": 1.9729216736913922e-05,
"loss": 0.5308,
"step": 619
},
{
"epoch": 0.3065134099616858,
"grad_norm": 0.529724204213462,
"learning_rate": 1.972831598420818e-05,
"loss": 0.6006,
"step": 620
},
{
"epoch": 0.30700778642936594,
"grad_norm": 0.18464846302410837,
"learning_rate": 1.972741375645622e-05,
"loss": 0.5996,
"step": 621
},
{
"epoch": 0.3075021628970461,
"grad_norm": 0.12544689191975636,
"learning_rate": 1.9726510053794834e-05,
"loss": 0.5305,
"step": 622
},
{
"epoch": 0.30799653936472626,
"grad_norm": 0.22119492831949092,
"learning_rate": 1.972560487636105e-05,
"loss": 0.5461,
"step": 623
},
{
"epoch": 0.3084909158324064,
"grad_norm": 0.1867207838496053,
"learning_rate": 1.9724698224292118e-05,
"loss": 0.5479,
"step": 624
},
{
"epoch": 0.30898529230008653,
"grad_norm": 0.1346100604172959,
"learning_rate": 1.9723790097725503e-05,
"loss": 0.5581,
"step": 625
},
{
"epoch": 0.30947966876776667,
"grad_norm": 0.12475770893785193,
"learning_rate": 1.97228804967989e-05,
"loss": 0.5141,
"step": 626
},
{
"epoch": 0.3099740452354468,
"grad_norm": 0.12221891107934275,
"learning_rate": 1.9721969421650223e-05,
"loss": 0.5359,
"step": 627
},
{
"epoch": 0.31046842170312694,
"grad_norm": 0.22369613158085294,
"learning_rate": 1.972105687241762e-05,
"loss": 0.5508,
"step": 628
},
{
"epoch": 0.3109627981708071,
"grad_norm": 0.12633584954868576,
"learning_rate": 1.972014284923945e-05,
"loss": 0.5416,
"step": 629
},
{
"epoch": 0.3114571746384872,
"grad_norm": 0.16812225042385873,
"learning_rate": 1.9719227352254307e-05,
"loss": 0.5361,
"step": 630
},
{
"epoch": 0.31195155110616735,
"grad_norm": 0.1952417488359483,
"learning_rate": 1.9718310381600992e-05,
"loss": 0.5921,
"step": 631
},
{
"epoch": 0.3124459275738475,
"grad_norm": 0.12519282889632175,
"learning_rate": 1.971739193741855e-05,
"loss": 0.5699,
"step": 632
},
{
"epoch": 0.3129403040415276,
"grad_norm": 0.13593806882502024,
"learning_rate": 1.9716472019846233e-05,
"loss": 0.5277,
"step": 633
},
{
"epoch": 0.31343468050920775,
"grad_norm": 0.11998208317698338,
"learning_rate": 1.9715550629023524e-05,
"loss": 0.539,
"step": 634
},
{
"epoch": 0.3139290569768879,
"grad_norm": 0.12596361627151206,
"learning_rate": 1.9714627765090126e-05,
"loss": 0.5384,
"step": 635
},
{
"epoch": 0.314423433444568,
"grad_norm": 0.7731972434927932,
"learning_rate": 1.9713703428185972e-05,
"loss": 0.5369,
"step": 636
},
{
"epoch": 0.31491780991224816,
"grad_norm": 0.1188478862649685,
"learning_rate": 1.9712777618451212e-05,
"loss": 0.518,
"step": 637
},
{
"epoch": 0.3154121863799283,
"grad_norm": 0.12355341995453166,
"learning_rate": 1.971185033602622e-05,
"loss": 0.5749,
"step": 638
},
{
"epoch": 0.31590656284760843,
"grad_norm": 0.12301345482895641,
"learning_rate": 1.9710921581051593e-05,
"loss": 0.5261,
"step": 639
},
{
"epoch": 0.31640093931528857,
"grad_norm": 0.1308819860715845,
"learning_rate": 1.9709991353668156e-05,
"loss": 0.5452,
"step": 640
},
{
"epoch": 0.31689531578296876,
"grad_norm": 0.1397499269669722,
"learning_rate": 1.9709059654016953e-05,
"loss": 0.5393,
"step": 641
},
{
"epoch": 0.3173896922506489,
"grad_norm": 0.12341424694045226,
"learning_rate": 1.9708126482239248e-05,
"loss": 0.5096,
"step": 642
},
{
"epoch": 0.317884068718329,
"grad_norm": 0.13039258229590434,
"learning_rate": 1.9707191838476538e-05,
"loss": 0.5381,
"step": 643
},
{
"epoch": 0.31837844518600916,
"grad_norm": 0.15335402497861594,
"learning_rate": 1.9706255722870536e-05,
"loss": 0.5069,
"step": 644
},
{
"epoch": 0.3188728216536893,
"grad_norm": 0.11288377595236967,
"learning_rate": 1.9705318135563173e-05,
"loss": 0.5029,
"step": 645
},
{
"epoch": 0.31936719812136943,
"grad_norm": 0.12349948296688901,
"learning_rate": 1.9704379076696617e-05,
"loss": 0.5066,
"step": 646
},
{
"epoch": 0.31986157458904957,
"grad_norm": 0.11606727679074605,
"learning_rate": 1.9703438546413252e-05,
"loss": 0.505,
"step": 647
},
{
"epoch": 0.3203559510567297,
"grad_norm": 0.1224832026473203,
"learning_rate": 1.970249654485568e-05,
"loss": 0.5389,
"step": 648
},
{
"epoch": 0.32085032752440984,
"grad_norm": 0.12008453329267979,
"learning_rate": 1.9701553072166735e-05,
"loss": 0.5444,
"step": 649
},
{
"epoch": 0.32134470399209,
"grad_norm": 0.11744589265716511,
"learning_rate": 1.970060812848947e-05,
"loss": 0.5362,
"step": 650
},
{
"epoch": 0.3218390804597701,
"grad_norm": 0.24252992372279641,
"learning_rate": 1.9699661713967158e-05,
"loss": 0.5561,
"step": 651
},
{
"epoch": 0.32233345692745025,
"grad_norm": 0.11908308740918042,
"learning_rate": 1.9698713828743304e-05,
"loss": 0.4971,
"step": 652
},
{
"epoch": 0.3228278333951304,
"grad_norm": 0.1198124384838922,
"learning_rate": 1.9697764472961623e-05,
"loss": 0.5109,
"step": 653
},
{
"epoch": 0.3233222098628105,
"grad_norm": 0.11517725373846983,
"learning_rate": 1.9696813646766064e-05,
"loss": 0.5113,
"step": 654
},
{
"epoch": 0.32381658633049065,
"grad_norm": 0.21002990095179938,
"learning_rate": 1.9695861350300798e-05,
"loss": 0.5487,
"step": 655
},
{
"epoch": 0.3243109627981708,
"grad_norm": 1.0195095478478853,
"learning_rate": 1.9694907583710207e-05,
"loss": 0.5776,
"step": 656
},
{
"epoch": 0.3248053392658509,
"grad_norm": 0.14024635457733337,
"learning_rate": 1.9693952347138917e-05,
"loss": 0.5202,
"step": 657
},
{
"epoch": 0.32529971573353106,
"grad_norm": 0.15665652116218484,
"learning_rate": 1.9692995640731753e-05,
"loss": 0.5373,
"step": 658
},
{
"epoch": 0.32579409220121125,
"grad_norm": 0.1668043516171802,
"learning_rate": 1.9692037464633782e-05,
"loss": 0.5246,
"step": 659
},
{
"epoch": 0.3262884686688914,
"grad_norm": 0.2927796038032934,
"learning_rate": 1.9691077818990284e-05,
"loss": 0.5493,
"step": 660
},
{
"epoch": 0.3267828451365715,
"grad_norm": 0.15252040903738667,
"learning_rate": 1.9690116703946765e-05,
"loss": 0.5475,
"step": 661
},
{
"epoch": 0.32727722160425166,
"grad_norm": 0.18706315568450854,
"learning_rate": 1.9689154119648952e-05,
"loss": 0.5472,
"step": 662
},
{
"epoch": 0.3277715980719318,
"grad_norm": 0.15327513759613323,
"learning_rate": 1.96881900662428e-05,
"loss": 0.5459,
"step": 663
},
{
"epoch": 0.3282659745396119,
"grad_norm": 0.14154467909067678,
"learning_rate": 1.9687224543874474e-05,
"loss": 0.5513,
"step": 664
},
{
"epoch": 0.32876035100729206,
"grad_norm": 0.14481076257997583,
"learning_rate": 1.968625755269038e-05,
"loss": 0.5027,
"step": 665
},
{
"epoch": 0.3292547274749722,
"grad_norm": 0.1388399727985029,
"learning_rate": 1.9685289092837135e-05,
"loss": 0.5192,
"step": 666
},
{
"epoch": 0.32974910394265233,
"grad_norm": 0.6156807204069455,
"learning_rate": 1.9684319164461573e-05,
"loss": 0.5964,
"step": 667
},
{
"epoch": 0.33024348041033247,
"grad_norm": 0.17259621934745165,
"learning_rate": 1.9683347767710765e-05,
"loss": 0.5616,
"step": 668
},
{
"epoch": 0.3307378568780126,
"grad_norm": 0.1308929514037825,
"learning_rate": 1.9682374902732003e-05,
"loss": 0.4987,
"step": 669
},
{
"epoch": 0.33123223334569274,
"grad_norm": 0.1589222660001957,
"learning_rate": 1.9681400569672786e-05,
"loss": 0.5249,
"step": 670
},
{
"epoch": 0.3317266098133729,
"grad_norm": 0.12583022145497202,
"learning_rate": 1.968042476868085e-05,
"loss": 0.5098,
"step": 671
},
{
"epoch": 0.332220986281053,
"grad_norm": 0.14791095497137316,
"learning_rate": 1.9679447499904153e-05,
"loss": 0.5353,
"step": 672
},
{
"epoch": 0.33271536274873315,
"grad_norm": 0.139068499537895,
"learning_rate": 1.967846876349087e-05,
"loss": 0.5357,
"step": 673
},
{
"epoch": 0.3332097392164133,
"grad_norm": 0.13206348990554664,
"learning_rate": 1.9677488559589403e-05,
"loss": 0.4992,
"step": 674
},
{
"epoch": 0.3337041156840934,
"grad_norm": 0.12903843357572198,
"learning_rate": 1.967650688834837e-05,
"loss": 0.5373,
"step": 675
},
{
"epoch": 0.33419849215177355,
"grad_norm": 0.13563325269608034,
"learning_rate": 1.967552374991662e-05,
"loss": 0.5143,
"step": 676
},
{
"epoch": 0.3346928686194537,
"grad_norm": 0.12676491737172524,
"learning_rate": 1.9674539144443217e-05,
"loss": 0.5147,
"step": 677
},
{
"epoch": 0.3351872450871339,
"grad_norm": 0.12996069424625978,
"learning_rate": 1.9673553072077454e-05,
"loss": 0.5455,
"step": 678
},
{
"epoch": 0.335681621554814,
"grad_norm": 0.13097310915868082,
"learning_rate": 1.9672565532968844e-05,
"loss": 0.5286,
"step": 679
},
{
"epoch": 0.33617599802249415,
"grad_norm": 0.13677665252710247,
"learning_rate": 1.9671576527267118e-05,
"loss": 0.4958,
"step": 680
},
{
"epoch": 0.3366703744901743,
"grad_norm": 0.1300336069711607,
"learning_rate": 1.9670586055122234e-05,
"loss": 0.5103,
"step": 681
},
{
"epoch": 0.3371647509578544,
"grad_norm": 0.47995730462252356,
"learning_rate": 1.9669594116684375e-05,
"loss": 0.5574,
"step": 682
},
{
"epoch": 0.33765912742553456,
"grad_norm": 0.17641837075158115,
"learning_rate": 1.966860071210394e-05,
"loss": 0.5249,
"step": 683
},
{
"epoch": 0.3381535038932147,
"grad_norm": 0.12229110835144939,
"learning_rate": 1.9667605841531548e-05,
"loss": 0.5334,
"step": 684
},
{
"epoch": 0.3386478803608948,
"grad_norm": 0.11719000457965606,
"learning_rate": 1.9666609505118053e-05,
"loss": 0.5134,
"step": 685
},
{
"epoch": 0.33914225682857496,
"grad_norm": 0.16687010385378812,
"learning_rate": 1.966561170301452e-05,
"loss": 0.5515,
"step": 686
},
{
"epoch": 0.3396366332962551,
"grad_norm": 0.12986685094668812,
"learning_rate": 1.9664612435372242e-05,
"loss": 0.5352,
"step": 687
},
{
"epoch": 0.34013100976393523,
"grad_norm": 0.12393941293857856,
"learning_rate": 1.9663611702342728e-05,
"loss": 0.5566,
"step": 688
},
{
"epoch": 0.34062538623161537,
"grad_norm": 0.12708571403995403,
"learning_rate": 1.9662609504077715e-05,
"loss": 0.5258,
"step": 689
},
{
"epoch": 0.3411197626992955,
"grad_norm": 0.1472776520340462,
"learning_rate": 1.9661605840729164e-05,
"loss": 0.5305,
"step": 690
},
{
"epoch": 0.34161413916697564,
"grad_norm": 0.12667368278483182,
"learning_rate": 1.9660600712449247e-05,
"loss": 0.5556,
"step": 691
},
{
"epoch": 0.3421085156346558,
"grad_norm": 0.2167094608182051,
"learning_rate": 1.9659594119390372e-05,
"loss": 0.5437,
"step": 692
},
{
"epoch": 0.3426028921023359,
"grad_norm": 0.13433681011721635,
"learning_rate": 1.965858606170516e-05,
"loss": 0.526,
"step": 693
},
{
"epoch": 0.34309726857001605,
"grad_norm": 0.1431960574658435,
"learning_rate": 1.9657576539546456e-05,
"loss": 0.5625,
"step": 694
},
{
"epoch": 0.3435916450376962,
"grad_norm": 0.13292820588891957,
"learning_rate": 1.9656565553067332e-05,
"loss": 0.5672,
"step": 695
},
{
"epoch": 0.34408602150537637,
"grad_norm": 0.14552737244953182,
"learning_rate": 1.9655553102421076e-05,
"loss": 0.515,
"step": 696
},
{
"epoch": 0.3445803979730565,
"grad_norm": 0.12043299529290434,
"learning_rate": 1.9654539187761193e-05,
"loss": 0.5016,
"step": 697
},
{
"epoch": 0.34507477444073664,
"grad_norm": 0.131488343368556,
"learning_rate": 1.9653523809241424e-05,
"loss": 0.5378,
"step": 698
},
{
"epoch": 0.3455691509084168,
"grad_norm": 0.1292428029192406,
"learning_rate": 1.965250696701572e-05,
"loss": 0.5061,
"step": 699
},
{
"epoch": 0.3460635273760969,
"grad_norm": 0.13112582211430657,
"learning_rate": 1.9651488661238273e-05,
"loss": 0.5201,
"step": 700
},
{
"epoch": 0.34655790384377705,
"grad_norm": 0.1560812435727658,
"learning_rate": 1.9650468892063462e-05,
"loss": 0.5205,
"step": 701
},
{
"epoch": 0.3470522803114572,
"grad_norm": 0.15584882333031627,
"learning_rate": 1.964944765964592e-05,
"loss": 0.5142,
"step": 702
},
{
"epoch": 0.3475466567791373,
"grad_norm": 0.18493557359220864,
"learning_rate": 1.9648424964140486e-05,
"loss": 0.5367,
"step": 703
},
{
"epoch": 0.34804103324681746,
"grad_norm": 0.13850630187485663,
"learning_rate": 1.9647400805702233e-05,
"loss": 0.496,
"step": 704
},
{
"epoch": 0.3485354097144976,
"grad_norm": 0.11964462116416809,
"learning_rate": 1.964637518448644e-05,
"loss": 0.5089,
"step": 705
},
{
"epoch": 0.3490297861821777,
"grad_norm": 0.12509491239365456,
"learning_rate": 1.9645348100648617e-05,
"loss": 0.5092,
"step": 706
},
{
"epoch": 0.34952416264985786,
"grad_norm": 0.1293046031606314,
"learning_rate": 1.9644319554344496e-05,
"loss": 0.5125,
"step": 707
},
{
"epoch": 0.350018539117538,
"grad_norm": 0.22036035366266618,
"learning_rate": 1.9643289545730028e-05,
"loss": 0.4983,
"step": 708
},
{
"epoch": 0.35051291558521813,
"grad_norm": 0.13883985086190825,
"learning_rate": 1.9642258074961388e-05,
"loss": 0.5412,
"step": 709
},
{
"epoch": 0.35100729205289827,
"grad_norm": 0.12360693729501804,
"learning_rate": 1.9641225142194974e-05,
"loss": 0.5187,
"step": 710
},
{
"epoch": 0.3515016685205784,
"grad_norm": 0.13500782796378485,
"learning_rate": 1.96401907475874e-05,
"loss": 0.5232,
"step": 711
},
{
"epoch": 0.35199604498825854,
"grad_norm": 0.12480667966814275,
"learning_rate": 1.963915489129551e-05,
"loss": 0.5202,
"step": 712
},
{
"epoch": 0.3524904214559387,
"grad_norm": 0.11381324212779681,
"learning_rate": 1.9638117573476356e-05,
"loss": 0.5111,
"step": 713
},
{
"epoch": 0.3529847979236188,
"grad_norm": 0.13746607881036724,
"learning_rate": 1.963707879428723e-05,
"loss": 0.5245,
"step": 714
},
{
"epoch": 0.353479174391299,
"grad_norm": 0.12477748624602462,
"learning_rate": 1.963603855388563e-05,
"loss": 0.5082,
"step": 715
},
{
"epoch": 0.35397355085897914,
"grad_norm": 0.12516534949851357,
"learning_rate": 1.963499685242928e-05,
"loss": 0.5433,
"step": 716
},
{
"epoch": 0.35446792732665927,
"grad_norm": 0.8916299303354146,
"learning_rate": 1.963395369007613e-05,
"loss": 0.5712,
"step": 717
},
{
"epoch": 0.3549623037943394,
"grad_norm": 0.12746953083368778,
"learning_rate": 1.963290906698435e-05,
"loss": 0.4922,
"step": 718
},
{
"epoch": 0.35545668026201954,
"grad_norm": 0.11757018324925317,
"learning_rate": 1.9631862983312326e-05,
"loss": 0.4968,
"step": 719
},
{
"epoch": 0.3559510567296997,
"grad_norm": 0.1385140360120195,
"learning_rate": 1.963081543921867e-05,
"loss": 0.5641,
"step": 720
},
{
"epoch": 0.3564454331973798,
"grad_norm": 0.13155746127068435,
"learning_rate": 1.9629766434862216e-05,
"loss": 0.4962,
"step": 721
},
{
"epoch": 0.35693980966505995,
"grad_norm": 0.1205601550520086,
"learning_rate": 1.962871597040202e-05,
"loss": 0.5272,
"step": 722
},
{
"epoch": 0.3574341861327401,
"grad_norm": 0.12574968551810145,
"learning_rate": 1.962766404599736e-05,
"loss": 0.5108,
"step": 723
},
{
"epoch": 0.3579285626004202,
"grad_norm": 0.14375967687164,
"learning_rate": 1.9626610661807723e-05,
"loss": 0.55,
"step": 724
},
{
"epoch": 0.35842293906810035,
"grad_norm": 0.13617533806055085,
"learning_rate": 1.9625555817992837e-05,
"loss": 0.5774,
"step": 725
},
{
"epoch": 0.3589173155357805,
"grad_norm": 0.5440753203886518,
"learning_rate": 1.9624499514712637e-05,
"loss": 0.6414,
"step": 726
},
{
"epoch": 0.3594116920034606,
"grad_norm": 0.13176030046261922,
"learning_rate": 1.9623441752127284e-05,
"loss": 0.5121,
"step": 727
},
{
"epoch": 0.35990606847114076,
"grad_norm": 0.14094242085793585,
"learning_rate": 1.962238253039716e-05,
"loss": 0.5501,
"step": 728
},
{
"epoch": 0.3604004449388209,
"grad_norm": 0.12630671997919654,
"learning_rate": 1.962132184968287e-05,
"loss": 0.5189,
"step": 729
},
{
"epoch": 0.36089482140650103,
"grad_norm": 0.15018005813165136,
"learning_rate": 1.962025971014524e-05,
"loss": 0.5503,
"step": 730
},
{
"epoch": 0.36138919787418117,
"grad_norm": 0.11927501584484901,
"learning_rate": 1.961919611194531e-05,
"loss": 0.511,
"step": 731
},
{
"epoch": 0.3618835743418613,
"grad_norm": 0.1604902451550611,
"learning_rate": 1.9618131055244355e-05,
"loss": 0.536,
"step": 732
},
{
"epoch": 0.36237795080954144,
"grad_norm": 0.2594644123519653,
"learning_rate": 1.9617064540203858e-05,
"loss": 0.5377,
"step": 733
},
{
"epoch": 0.36287232727722163,
"grad_norm": 0.12812206414163724,
"learning_rate": 1.961599656698553e-05,
"loss": 0.5076,
"step": 734
},
{
"epoch": 0.36336670374490176,
"grad_norm": 0.11694087180360785,
"learning_rate": 1.9614927135751302e-05,
"loss": 0.5216,
"step": 735
},
{
"epoch": 0.3638610802125819,
"grad_norm": 0.15838502675089408,
"learning_rate": 1.9613856246663324e-05,
"loss": 0.5379,
"step": 736
},
{
"epoch": 0.36435545668026204,
"grad_norm": 0.12458665764837283,
"learning_rate": 1.9612783899883964e-05,
"loss": 0.544,
"step": 737
},
{
"epoch": 0.36484983314794217,
"grad_norm": 0.1356438001755137,
"learning_rate": 1.9611710095575828e-05,
"loss": 0.5319,
"step": 738
},
{
"epoch": 0.3653442096156223,
"grad_norm": 0.13553749506782095,
"learning_rate": 1.961063483390172e-05,
"loss": 0.5341,
"step": 739
},
{
"epoch": 0.36583858608330244,
"grad_norm": 0.4227711147261919,
"learning_rate": 1.9609558115024673e-05,
"loss": 0.5165,
"step": 740
},
{
"epoch": 0.3663329625509826,
"grad_norm": 0.13986871696402575,
"learning_rate": 1.9608479939107952e-05,
"loss": 0.5211,
"step": 741
},
{
"epoch": 0.3668273390186627,
"grad_norm": 0.12497959937230758,
"learning_rate": 1.9607400306315033e-05,
"loss": 0.4992,
"step": 742
},
{
"epoch": 0.36732171548634285,
"grad_norm": 0.16958248444772878,
"learning_rate": 1.9606319216809614e-05,
"loss": 0.5855,
"step": 743
},
{
"epoch": 0.367816091954023,
"grad_norm": 0.13150746163175986,
"learning_rate": 1.9605236670755608e-05,
"loss": 0.5305,
"step": 744
},
{
"epoch": 0.3683104684217031,
"grad_norm": 0.11931933129106738,
"learning_rate": 1.9604152668317164e-05,
"loss": 0.5465,
"step": 745
},
{
"epoch": 0.36880484488938325,
"grad_norm": 0.12586876195836844,
"learning_rate": 1.9603067209658634e-05,
"loss": 0.5529,
"step": 746
},
{
"epoch": 0.3692992213570634,
"grad_norm": 0.1255209747363213,
"learning_rate": 1.9601980294944602e-05,
"loss": 0.5122,
"step": 747
},
{
"epoch": 0.3697935978247435,
"grad_norm": 0.12596832232094882,
"learning_rate": 1.9600891924339875e-05,
"loss": 0.5159,
"step": 748
},
{
"epoch": 0.37028797429242366,
"grad_norm": 0.11292846674404017,
"learning_rate": 1.9599802098009475e-05,
"loss": 0.5197,
"step": 749
},
{
"epoch": 0.3707823507601038,
"grad_norm": 0.1331491455603008,
"learning_rate": 1.9598710816118643e-05,
"loss": 0.5276,
"step": 750
},
{
"epoch": 0.37127672722778393,
"grad_norm": 0.11912450147564574,
"learning_rate": 1.9597618078832844e-05,
"loss": 0.5081,
"step": 751
},
{
"epoch": 0.3717711036954641,
"grad_norm": 0.11994098150118168,
"learning_rate": 1.9596523886317764e-05,
"loss": 0.5312,
"step": 752
},
{
"epoch": 0.37226548016314426,
"grad_norm": 0.1405345673244345,
"learning_rate": 1.9595428238739308e-05,
"loss": 0.5325,
"step": 753
},
{
"epoch": 0.3727598566308244,
"grad_norm": 0.11392427621538659,
"learning_rate": 1.95943311362636e-05,
"loss": 0.5083,
"step": 754
},
{
"epoch": 0.37325423309850453,
"grad_norm": 0.13141437002189507,
"learning_rate": 1.9593232579056996e-05,
"loss": 0.543,
"step": 755
},
{
"epoch": 0.37374860956618466,
"grad_norm": 0.10897570966546818,
"learning_rate": 1.959213256728606e-05,
"loss": 0.5011,
"step": 756
},
{
"epoch": 0.3742429860338648,
"grad_norm": 0.1253696439437347,
"learning_rate": 1.959103110111757e-05,
"loss": 0.5533,
"step": 757
},
{
"epoch": 0.37473736250154493,
"grad_norm": 0.14769479365346622,
"learning_rate": 1.958992818071855e-05,
"loss": 0.5639,
"step": 758
},
{
"epoch": 0.37523173896922507,
"grad_norm": 0.160820176951948,
"learning_rate": 1.9588823806256213e-05,
"loss": 0.5161,
"step": 759
},
{
"epoch": 0.3757261154369052,
"grad_norm": 0.11177364017811865,
"learning_rate": 1.9587717977898025e-05,
"loss": 0.4949,
"step": 760
},
{
"epoch": 0.37622049190458534,
"grad_norm": 0.12096752909563571,
"learning_rate": 1.9586610695811647e-05,
"loss": 0.5185,
"step": 761
},
{
"epoch": 0.3767148683722655,
"grad_norm": 0.1440345250043191,
"learning_rate": 1.9585501960164972e-05,
"loss": 0.5156,
"step": 762
},
{
"epoch": 0.3772092448399456,
"grad_norm": 0.1446287688174956,
"learning_rate": 1.958439177112611e-05,
"loss": 0.5241,
"step": 763
},
{
"epoch": 0.37770362130762575,
"grad_norm": 0.11444268278950813,
"learning_rate": 1.9583280128863393e-05,
"loss": 0.4877,
"step": 764
},
{
"epoch": 0.3781979977753059,
"grad_norm": 0.13611880717007074,
"learning_rate": 1.958216703354537e-05,
"loss": 0.5166,
"step": 765
},
{
"epoch": 0.378692374242986,
"grad_norm": 0.13476334786939184,
"learning_rate": 1.9581052485340815e-05,
"loss": 0.5299,
"step": 766
},
{
"epoch": 0.37918675071066615,
"grad_norm": 0.1363209736159827,
"learning_rate": 1.9579936484418726e-05,
"loss": 0.529,
"step": 767
},
{
"epoch": 0.3796811271783463,
"grad_norm": 0.1266213921378826,
"learning_rate": 1.9578819030948302e-05,
"loss": 0.5121,
"step": 768
},
{
"epoch": 0.3801755036460264,
"grad_norm": 0.14236429655716956,
"learning_rate": 1.9577700125098988e-05,
"loss": 0.5599,
"step": 769
},
{
"epoch": 0.38066988011370656,
"grad_norm": 0.1254939483989073,
"learning_rate": 1.9576579767040434e-05,
"loss": 0.5151,
"step": 770
},
{
"epoch": 0.38116425658138675,
"grad_norm": 0.11690953803977183,
"learning_rate": 1.9575457956942508e-05,
"loss": 0.4844,
"step": 771
},
{
"epoch": 0.3816586330490669,
"grad_norm": 0.11110030361502221,
"learning_rate": 1.957433469497531e-05,
"loss": 0.5076,
"step": 772
},
{
"epoch": 0.382153009516747,
"grad_norm": 0.11855472940586433,
"learning_rate": 1.9573209981309152e-05,
"loss": 0.5004,
"step": 773
},
{
"epoch": 0.38264738598442716,
"grad_norm": 0.11429391948482963,
"learning_rate": 1.9572083816114563e-05,
"loss": 0.5382,
"step": 774
},
{
"epoch": 0.3831417624521073,
"grad_norm": 0.22490900588368026,
"learning_rate": 1.95709561995623e-05,
"loss": 0.5118,
"step": 775
},
{
"epoch": 0.38363613891978743,
"grad_norm": 0.11967963393975331,
"learning_rate": 1.956982713182334e-05,
"loss": 0.5457,
"step": 776
},
{
"epoch": 0.38413051538746756,
"grad_norm": 0.11297122274946826,
"learning_rate": 1.9568696613068868e-05,
"loss": 0.5347,
"step": 777
},
{
"epoch": 0.3846248918551477,
"grad_norm": 0.11876129238381135,
"learning_rate": 1.9567564643470307e-05,
"loss": 0.5311,
"step": 778
},
{
"epoch": 0.38511926832282783,
"grad_norm": 0.14882293495769927,
"learning_rate": 1.9566431223199288e-05,
"loss": 0.5147,
"step": 779
},
{
"epoch": 0.38561364479050797,
"grad_norm": 0.11949438601582658,
"learning_rate": 1.9565296352427664e-05,
"loss": 0.5226,
"step": 780
},
{
"epoch": 0.3861080212581881,
"grad_norm": 0.11818305497786784,
"learning_rate": 1.9564160031327505e-05,
"loss": 0.5077,
"step": 781
},
{
"epoch": 0.38660239772586824,
"grad_norm": 0.15463974846708808,
"learning_rate": 1.9563022260071108e-05,
"loss": 0.5256,
"step": 782
},
{
"epoch": 0.3870967741935484,
"grad_norm": 0.6712045544071819,
"learning_rate": 1.956188303883099e-05,
"loss": 0.5215,
"step": 783
},
{
"epoch": 0.3875911506612285,
"grad_norm": 0.13357716099119674,
"learning_rate": 1.9560742367779878e-05,
"loss": 0.5169,
"step": 784
},
{
"epoch": 0.38808552712890865,
"grad_norm": 13.105647935913273,
"learning_rate": 1.9559600247090727e-05,
"loss": 0.7231,
"step": 785
},
{
"epoch": 0.3885799035965888,
"grad_norm": 0.18132324717559783,
"learning_rate": 1.9558456676936716e-05,
"loss": 0.5303,
"step": 786
},
{
"epoch": 0.3890742800642689,
"grad_norm": 0.168777667134267,
"learning_rate": 1.9557311657491226e-05,
"loss": 0.5303,
"step": 787
},
{
"epoch": 0.38956865653194905,
"grad_norm": 0.2316540325496849,
"learning_rate": 1.9556165188927882e-05,
"loss": 0.5081,
"step": 788
},
{
"epoch": 0.39006303299962924,
"grad_norm": 0.215617507726242,
"learning_rate": 1.9555017271420505e-05,
"loss": 0.5407,
"step": 789
},
{
"epoch": 0.3905574094673094,
"grad_norm": 0.16630384509853238,
"learning_rate": 1.9553867905143154e-05,
"loss": 0.5347,
"step": 790
},
{
"epoch": 0.3910517859349895,
"grad_norm": 0.17661866383139865,
"learning_rate": 1.9552717090270093e-05,
"loss": 0.535,
"step": 791
},
{
"epoch": 0.39154616240266965,
"grad_norm": 0.15850328473950456,
"learning_rate": 1.9551564826975826e-05,
"loss": 0.5318,
"step": 792
},
{
"epoch": 0.3920405388703498,
"grad_norm": 0.1625797214586048,
"learning_rate": 1.9550411115435055e-05,
"loss": 0.5128,
"step": 793
},
{
"epoch": 0.3925349153380299,
"grad_norm": 0.14383139961470717,
"learning_rate": 1.9549255955822708e-05,
"loss": 0.5508,
"step": 794
},
{
"epoch": 0.39302929180571006,
"grad_norm": 0.48900047765435783,
"learning_rate": 1.954809934831394e-05,
"loss": 0.563,
"step": 795
},
{
"epoch": 0.3935236682733902,
"grad_norm": 0.1387641990843941,
"learning_rate": 1.954694129308412e-05,
"loss": 0.52,
"step": 796
},
{
"epoch": 0.39401804474107033,
"grad_norm": 0.24201186471731123,
"learning_rate": 1.9545781790308834e-05,
"loss": 0.5082,
"step": 797
},
{
"epoch": 0.39451242120875046,
"grad_norm": 0.1486192562906134,
"learning_rate": 1.9544620840163893e-05,
"loss": 0.5278,
"step": 798
},
{
"epoch": 0.3950067976764306,
"grad_norm": 0.3021042936646202,
"learning_rate": 1.9543458442825327e-05,
"loss": 0.5588,
"step": 799
},
{
"epoch": 0.39550117414411073,
"grad_norm": 0.15227552089374463,
"learning_rate": 1.954229459846938e-05,
"loss": 0.523,
"step": 800
},
{
"epoch": 0.39599555061179087,
"grad_norm": 0.14643462519076003,
"learning_rate": 1.9541129307272516e-05,
"loss": 0.5443,
"step": 801
},
{
"epoch": 0.396489927079471,
"grad_norm": 0.13913089309069804,
"learning_rate": 1.953996256941143e-05,
"loss": 0.561,
"step": 802
},
{
"epoch": 0.39698430354715114,
"grad_norm": 0.1376395527394012,
"learning_rate": 1.9538794385063018e-05,
"loss": 0.5674,
"step": 803
},
{
"epoch": 0.3974786800148313,
"grad_norm": 0.16179168853807271,
"learning_rate": 1.953762475440441e-05,
"loss": 0.5632,
"step": 804
},
{
"epoch": 0.3979730564825114,
"grad_norm": 0.13391651965352255,
"learning_rate": 1.9536453677612947e-05,
"loss": 0.5567,
"step": 805
},
{
"epoch": 0.39846743295019155,
"grad_norm": 0.12467236090372662,
"learning_rate": 1.9535281154866195e-05,
"loss": 0.5276,
"step": 806
},
{
"epoch": 0.3989618094178717,
"grad_norm": 0.1313225152119864,
"learning_rate": 1.9534107186341938e-05,
"loss": 0.5168,
"step": 807
},
{
"epoch": 0.3994561858855519,
"grad_norm": 0.12146563630276597,
"learning_rate": 1.9532931772218175e-05,
"loss": 0.5505,
"step": 808
},
{
"epoch": 0.399950562353232,
"grad_norm": 0.12566845129031629,
"learning_rate": 1.9531754912673128e-05,
"loss": 0.5043,
"step": 809
},
{
"epoch": 0.40044493882091214,
"grad_norm": 0.13390243351327377,
"learning_rate": 1.9530576607885233e-05,
"loss": 0.5314,
"step": 810
},
{
"epoch": 0.4009393152885923,
"grad_norm": 0.1206118022336426,
"learning_rate": 1.9529396858033153e-05,
"loss": 0.5141,
"step": 811
},
{
"epoch": 0.4014336917562724,
"grad_norm": 0.12213666880531626,
"learning_rate": 1.952821566329577e-05,
"loss": 0.5424,
"step": 812
},
{
"epoch": 0.40192806822395255,
"grad_norm": 0.11710245584447163,
"learning_rate": 1.9527033023852178e-05,
"loss": 0.5174,
"step": 813
},
{
"epoch": 0.4024224446916327,
"grad_norm": 0.1308808945672156,
"learning_rate": 1.9525848939881694e-05,
"loss": 0.5537,
"step": 814
},
{
"epoch": 0.4029168211593128,
"grad_norm": 0.11796762049437635,
"learning_rate": 1.9524663411563848e-05,
"loss": 0.518,
"step": 815
},
{
"epoch": 0.40341119762699296,
"grad_norm": 0.12454388238355925,
"learning_rate": 1.9523476439078405e-05,
"loss": 0.509,
"step": 816
},
{
"epoch": 0.4039055740946731,
"grad_norm": 0.11985278986274614,
"learning_rate": 1.9522288022605332e-05,
"loss": 0.5191,
"step": 817
},
{
"epoch": 0.4043999505623532,
"grad_norm": 0.13007563694436983,
"learning_rate": 1.952109816232482e-05,
"loss": 0.5443,
"step": 818
},
{
"epoch": 0.40489432703003336,
"grad_norm": 0.13178116433983927,
"learning_rate": 1.9519906858417286e-05,
"loss": 0.5873,
"step": 819
},
{
"epoch": 0.4053887034977135,
"grad_norm": 0.11917127433844353,
"learning_rate": 1.9518714111063355e-05,
"loss": 0.5136,
"step": 820
},
{
"epoch": 0.40588307996539363,
"grad_norm": 0.14381744235612728,
"learning_rate": 1.951751992044388e-05,
"loss": 0.5691,
"step": 821
},
{
"epoch": 0.40637745643307377,
"grad_norm": 0.1266993790551995,
"learning_rate": 1.9516324286739925e-05,
"loss": 0.4994,
"step": 822
},
{
"epoch": 0.4068718329007539,
"grad_norm": 0.11879830734033153,
"learning_rate": 1.9515127210132783e-05,
"loss": 0.5027,
"step": 823
},
{
"epoch": 0.40736620936843404,
"grad_norm": 0.13009096570436296,
"learning_rate": 1.951392869080395e-05,
"loss": 0.5089,
"step": 824
},
{
"epoch": 0.4078605858361142,
"grad_norm": 0.11443257242502672,
"learning_rate": 1.9512728728935162e-05,
"loss": 0.5035,
"step": 825
},
{
"epoch": 0.40835496230379437,
"grad_norm": 0.1400264446359821,
"learning_rate": 1.9511527324708354e-05,
"loss": 0.5001,
"step": 826
},
{
"epoch": 0.4088493387714745,
"grad_norm": 0.11372896330200072,
"learning_rate": 1.9510324478305686e-05,
"loss": 0.502,
"step": 827
},
{
"epoch": 0.40934371523915464,
"grad_norm": 0.12073843736446423,
"learning_rate": 1.9509120189909544e-05,
"loss": 0.5386,
"step": 828
},
{
"epoch": 0.4098380917068348,
"grad_norm": 0.12376524044000999,
"learning_rate": 1.9507914459702526e-05,
"loss": 0.5427,
"step": 829
},
{
"epoch": 0.4103324681745149,
"grad_norm": 0.1185575075554648,
"learning_rate": 1.950670728786745e-05,
"loss": 0.5063,
"step": 830
},
{
"epoch": 0.41082684464219504,
"grad_norm": 0.1299681635913935,
"learning_rate": 1.950549867458735e-05,
"loss": 0.5125,
"step": 831
},
{
"epoch": 0.4113212211098752,
"grad_norm": 0.12769767861884082,
"learning_rate": 1.950428862004548e-05,
"loss": 0.5029,
"step": 832
},
{
"epoch": 0.4118155975775553,
"grad_norm": 0.12124601978673928,
"learning_rate": 1.9503077124425318e-05,
"loss": 0.5549,
"step": 833
},
{
"epoch": 0.41230997404523545,
"grad_norm": 0.11700031804249206,
"learning_rate": 1.9501864187910548e-05,
"loss": 0.5179,
"step": 834
},
{
"epoch": 0.4128043505129156,
"grad_norm": 0.11975068859424053,
"learning_rate": 1.950064981068509e-05,
"loss": 0.5126,
"step": 835
},
{
"epoch": 0.4132987269805957,
"grad_norm": 0.14307422563281222,
"learning_rate": 1.9499433992933067e-05,
"loss": 0.5161,
"step": 836
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.11731151792123069,
"learning_rate": 1.9498216734838825e-05,
"loss": 0.5349,
"step": 837
},
{
"epoch": 0.414287479915956,
"grad_norm": 0.12134095572797407,
"learning_rate": 1.9496998036586935e-05,
"loss": 0.4919,
"step": 838
},
{
"epoch": 0.4147818563836361,
"grad_norm": 0.11667409839621301,
"learning_rate": 1.9495777898362172e-05,
"loss": 0.4997,
"step": 839
},
{
"epoch": 0.41527623285131626,
"grad_norm": 0.11859686407599757,
"learning_rate": 1.9494556320349546e-05,
"loss": 0.5319,
"step": 840
},
{
"epoch": 0.4157706093189964,
"grad_norm": 0.12084033139279161,
"learning_rate": 1.949333330273428e-05,
"loss": 0.5067,
"step": 841
},
{
"epoch": 0.41626498578667653,
"grad_norm": 0.11250067396072169,
"learning_rate": 1.9492108845701802e-05,
"loss": 0.5172,
"step": 842
},
{
"epoch": 0.41675936225435667,
"grad_norm": 0.13978318658547215,
"learning_rate": 1.9490882949437778e-05,
"loss": 0.5158,
"step": 843
},
{
"epoch": 0.4172537387220368,
"grad_norm": 0.1186448263884901,
"learning_rate": 1.948965561412808e-05,
"loss": 0.5218,
"step": 844
},
{
"epoch": 0.417748115189717,
"grad_norm": 0.1313665555856829,
"learning_rate": 1.94884268399588e-05,
"loss": 0.5333,
"step": 845
},
{
"epoch": 0.41824249165739713,
"grad_norm": 0.12049469438961873,
"learning_rate": 1.9487196627116256e-05,
"loss": 0.5404,
"step": 846
},
{
"epoch": 0.41873686812507727,
"grad_norm": 0.12837221434519983,
"learning_rate": 1.9485964975786974e-05,
"loss": 0.5321,
"step": 847
},
{
"epoch": 0.4192312445927574,
"grad_norm": 0.12298264147573502,
"learning_rate": 1.9484731886157695e-05,
"loss": 0.539,
"step": 848
},
{
"epoch": 0.41972562106043754,
"grad_norm": 0.14349138081326696,
"learning_rate": 1.9483497358415394e-05,
"loss": 0.5341,
"step": 849
},
{
"epoch": 0.4202199975281177,
"grad_norm": 0.12990265740863893,
"learning_rate": 1.9482261392747255e-05,
"loss": 0.5117,
"step": 850
},
{
"epoch": 0.4207143739957978,
"grad_norm": 0.11667965750273937,
"learning_rate": 1.9481023989340674e-05,
"loss": 0.5349,
"step": 851
},
{
"epoch": 0.42120875046347794,
"grad_norm": 0.11904354405886217,
"learning_rate": 1.9479785148383277e-05,
"loss": 0.5202,
"step": 852
},
{
"epoch": 0.4217031269311581,
"grad_norm": 0.12569527800348054,
"learning_rate": 1.94785448700629e-05,
"loss": 0.5704,
"step": 853
},
{
"epoch": 0.4221975033988382,
"grad_norm": 0.12001234313470713,
"learning_rate": 1.9477303154567594e-05,
"loss": 0.5309,
"step": 854
},
{
"epoch": 0.42269187986651835,
"grad_norm": 0.11300292595990313,
"learning_rate": 1.9476060002085644e-05,
"loss": 0.5067,
"step": 855
},
{
"epoch": 0.4231862563341985,
"grad_norm": 0.11615002458026627,
"learning_rate": 1.947481541280553e-05,
"loss": 0.4817,
"step": 856
},
{
"epoch": 0.4236806328018786,
"grad_norm": 0.135448995049639,
"learning_rate": 1.9473569386915968e-05,
"loss": 0.5339,
"step": 857
},
{
"epoch": 0.42417500926955876,
"grad_norm": 0.1163469567915728,
"learning_rate": 1.9472321924605885e-05,
"loss": 0.5349,
"step": 858
},
{
"epoch": 0.4246693857372389,
"grad_norm": 0.120000572671405,
"learning_rate": 1.9471073026064427e-05,
"loss": 0.495,
"step": 859
},
{
"epoch": 0.425163762204919,
"grad_norm": 0.11256258714032456,
"learning_rate": 1.9469822691480952e-05,
"loss": 0.4916,
"step": 860
},
{
"epoch": 0.42565813867259916,
"grad_norm": 0.11995027237641946,
"learning_rate": 1.9468570921045046e-05,
"loss": 0.5203,
"step": 861
},
{
"epoch": 0.4261525151402793,
"grad_norm": 0.11100537900016944,
"learning_rate": 1.9467317714946503e-05,
"loss": 0.5039,
"step": 862
},
{
"epoch": 0.4266468916079595,
"grad_norm": 0.12092645307881696,
"learning_rate": 1.9466063073375342e-05,
"loss": 0.469,
"step": 863
},
{
"epoch": 0.4271412680756396,
"grad_norm": 0.11784455717038106,
"learning_rate": 1.94648069965218e-05,
"loss": 0.5098,
"step": 864
},
{
"epoch": 0.42763564454331976,
"grad_norm": 0.1160304804893519,
"learning_rate": 1.9463549484576326e-05,
"loss": 0.5138,
"step": 865
},
{
"epoch": 0.4281300210109999,
"grad_norm": 0.11110478221257013,
"learning_rate": 1.946229053772958e-05,
"loss": 0.4941,
"step": 866
},
{
"epoch": 0.42862439747868003,
"grad_norm": 0.11147439116639915,
"learning_rate": 1.9461030156172463e-05,
"loss": 0.4912,
"step": 867
},
{
"epoch": 0.42911877394636017,
"grad_norm": 0.11511505001856205,
"learning_rate": 1.9459768340096073e-05,
"loss": 0.489,
"step": 868
},
{
"epoch": 0.4296131504140403,
"grad_norm": 0.12904639097840453,
"learning_rate": 1.945850508969173e-05,
"loss": 0.5187,
"step": 869
},
{
"epoch": 0.43010752688172044,
"grad_norm": 0.18164198342287308,
"learning_rate": 1.945724040515097e-05,
"loss": 0.5393,
"step": 870
},
{
"epoch": 0.43060190334940057,
"grad_norm": 0.11992250870124227,
"learning_rate": 1.945597428666556e-05,
"loss": 0.4819,
"step": 871
},
{
"epoch": 0.4310962798170807,
"grad_norm": 0.11408775905334834,
"learning_rate": 1.9454706734427464e-05,
"loss": 0.508,
"step": 872
},
{
"epoch": 0.43159065628476084,
"grad_norm": 0.12325589735288589,
"learning_rate": 1.9453437748628875e-05,
"loss": 0.5083,
"step": 873
},
{
"epoch": 0.432085032752441,
"grad_norm": 0.11624284805073162,
"learning_rate": 1.945216732946221e-05,
"loss": 0.5916,
"step": 874
},
{
"epoch": 0.4325794092201211,
"grad_norm": 0.12680844652417092,
"learning_rate": 1.9450895477120083e-05,
"loss": 0.5232,
"step": 875
},
{
"epoch": 0.43307378568780125,
"grad_norm": 0.11496580104028518,
"learning_rate": 1.9449622191795345e-05,
"loss": 0.4937,
"step": 876
},
{
"epoch": 0.4335681621554814,
"grad_norm": 0.1519239563008219,
"learning_rate": 1.9448347473681055e-05,
"loss": 0.5492,
"step": 877
},
{
"epoch": 0.4340625386231615,
"grad_norm": 0.11254539945296084,
"learning_rate": 1.944707132297049e-05,
"loss": 0.5239,
"step": 878
},
{
"epoch": 0.43455691509084166,
"grad_norm": 0.12320712925493102,
"learning_rate": 1.944579373985715e-05,
"loss": 0.5297,
"step": 879
},
{
"epoch": 0.4350512915585218,
"grad_norm": 0.11839366859508396,
"learning_rate": 1.944451472453474e-05,
"loss": 0.5185,
"step": 880
},
{
"epoch": 0.4355456680262019,
"grad_norm": 0.11876750114376593,
"learning_rate": 1.944323427719719e-05,
"loss": 0.521,
"step": 881
},
{
"epoch": 0.4360400444938821,
"grad_norm": 0.12014143430922175,
"learning_rate": 1.944195239803865e-05,
"loss": 0.5173,
"step": 882
},
{
"epoch": 0.43653442096156225,
"grad_norm": 0.12129105881254212,
"learning_rate": 1.9440669087253484e-05,
"loss": 0.5394,
"step": 883
},
{
"epoch": 0.4370287974292424,
"grad_norm": 0.13050061369644272,
"learning_rate": 1.943938434503627e-05,
"loss": 0.4937,
"step": 884
},
{
"epoch": 0.4375231738969225,
"grad_norm": 0.10909974923846931,
"learning_rate": 1.943809817158181e-05,
"loss": 0.5225,
"step": 885
},
{
"epoch": 0.43801755036460266,
"grad_norm": 0.11852864672931993,
"learning_rate": 1.9436810567085113e-05,
"loss": 0.5069,
"step": 886
},
{
"epoch": 0.4385119268322828,
"grad_norm": 0.11714782403186975,
"learning_rate": 1.9435521531741414e-05,
"loss": 0.5422,
"step": 887
},
{
"epoch": 0.43900630329996293,
"grad_norm": 0.1133296037621821,
"learning_rate": 1.9434231065746165e-05,
"loss": 0.5031,
"step": 888
},
{
"epoch": 0.43950067976764307,
"grad_norm": 0.12360844033531326,
"learning_rate": 1.9432939169295023e-05,
"loss": 0.5269,
"step": 889
},
{
"epoch": 0.4399950562353232,
"grad_norm": 0.10649821307755614,
"learning_rate": 1.9431645842583878e-05,
"loss": 0.4887,
"step": 890
},
{
"epoch": 0.44048943270300334,
"grad_norm": 0.11559128642076653,
"learning_rate": 1.9430351085808824e-05,
"loss": 0.4893,
"step": 891
},
{
"epoch": 0.44098380917068347,
"grad_norm": 0.11286781660134543,
"learning_rate": 1.9429054899166183e-05,
"loss": 0.5101,
"step": 892
},
{
"epoch": 0.4414781856383636,
"grad_norm": 0.11670583057661042,
"learning_rate": 1.9427757282852483e-05,
"loss": 0.5043,
"step": 893
},
{
"epoch": 0.44197256210604374,
"grad_norm": 0.12689444725473112,
"learning_rate": 1.9426458237064477e-05,
"loss": 0.4964,
"step": 894
},
{
"epoch": 0.4424669385737239,
"grad_norm": 0.1175352009525058,
"learning_rate": 1.942515776199913e-05,
"loss": 0.4859,
"step": 895
},
{
"epoch": 0.442961315041404,
"grad_norm": 0.11743708905038726,
"learning_rate": 1.942385585785363e-05,
"loss": 0.5223,
"step": 896
},
{
"epoch": 0.44345569150908415,
"grad_norm": 0.1135847413415584,
"learning_rate": 1.9422552524825366e-05,
"loss": 0.5111,
"step": 897
},
{
"epoch": 0.4439500679767643,
"grad_norm": 0.12033937117356377,
"learning_rate": 1.942124776311196e-05,
"loss": 0.5024,
"step": 898
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.10868608365182751,
"learning_rate": 1.941994157291125e-05,
"loss": 0.5129,
"step": 899
},
{
"epoch": 0.44493882091212456,
"grad_norm": 0.12391912980999625,
"learning_rate": 1.9418633954421283e-05,
"loss": 0.5418,
"step": 900
},
{
"epoch": 0.44543319737980475,
"grad_norm": 0.11314437962107442,
"learning_rate": 1.941732490784032e-05,
"loss": 0.5237,
"step": 901
},
{
"epoch": 0.4459275738474849,
"grad_norm": 0.11925279876590858,
"learning_rate": 1.9416014433366857e-05,
"loss": 0.5072,
"step": 902
},
{
"epoch": 0.446421950315165,
"grad_norm": 0.11237003717294791,
"learning_rate": 1.9414702531199577e-05,
"loss": 0.4851,
"step": 903
},
{
"epoch": 0.44691632678284515,
"grad_norm": 0.11682895944475345,
"learning_rate": 1.9413389201537405e-05,
"loss": 0.5021,
"step": 904
},
{
"epoch": 0.4474107032505253,
"grad_norm": 0.12844124807021137,
"learning_rate": 1.9412074444579475e-05,
"loss": 0.5702,
"step": 905
},
{
"epoch": 0.4479050797182054,
"grad_norm": 0.10670867342591153,
"learning_rate": 1.9410758260525128e-05,
"loss": 0.4644,
"step": 906
},
{
"epoch": 0.44839945618588556,
"grad_norm": 0.11909755744331701,
"learning_rate": 1.9409440649573935e-05,
"loss": 0.5068,
"step": 907
},
{
"epoch": 0.4488938326535657,
"grad_norm": 0.11046634657067686,
"learning_rate": 1.9408121611925677e-05,
"loss": 0.5195,
"step": 908
},
{
"epoch": 0.44938820912124583,
"grad_norm": 0.11825207651457217,
"learning_rate": 1.940680114778035e-05,
"loss": 0.5107,
"step": 909
},
{
"epoch": 0.44988258558892597,
"grad_norm": 0.11016360074491491,
"learning_rate": 1.940547925733817e-05,
"loss": 0.5237,
"step": 910
},
{
"epoch": 0.4503769620566061,
"grad_norm": 0.12023596756590303,
"learning_rate": 1.9404155940799566e-05,
"loss": 0.5399,
"step": 911
},
{
"epoch": 0.45087133852428624,
"grad_norm": 0.12083711351101563,
"learning_rate": 1.940283119836518e-05,
"loss": 0.5074,
"step": 912
},
{
"epoch": 0.45136571499196637,
"grad_norm": 0.11462166373514279,
"learning_rate": 1.940150503023589e-05,
"loss": 0.4992,
"step": 913
},
{
"epoch": 0.4518600914596465,
"grad_norm": 0.1214226740710518,
"learning_rate": 1.9400177436612756e-05,
"loss": 0.5155,
"step": 914
},
{
"epoch": 0.45235446792732664,
"grad_norm": 0.12328412434940575,
"learning_rate": 1.9398848417697086e-05,
"loss": 0.5157,
"step": 915
},
{
"epoch": 0.4528488443950068,
"grad_norm": 0.11252450966296666,
"learning_rate": 1.9397517973690382e-05,
"loss": 0.5096,
"step": 916
},
{
"epoch": 0.4533432208626869,
"grad_norm": 0.1266653824111574,
"learning_rate": 1.9396186104794378e-05,
"loss": 0.5417,
"step": 917
},
{
"epoch": 0.45383759733036705,
"grad_norm": 0.11604235331034492,
"learning_rate": 1.9394852811211014e-05,
"loss": 0.5262,
"step": 918
},
{
"epoch": 0.45433197379804724,
"grad_norm": 0.14371530323121903,
"learning_rate": 1.9393518093142453e-05,
"loss": 0.5083,
"step": 919
},
{
"epoch": 0.4548263502657274,
"grad_norm": 0.13438780093189526,
"learning_rate": 1.939218195079107e-05,
"loss": 0.5178,
"step": 920
},
{
"epoch": 0.4553207267334075,
"grad_norm": 0.11797536707037623,
"learning_rate": 1.939084438435945e-05,
"loss": 0.5374,
"step": 921
},
{
"epoch": 0.45581510320108765,
"grad_norm": 0.12984832601327378,
"learning_rate": 1.9389505394050405e-05,
"loss": 0.548,
"step": 922
},
{
"epoch": 0.4563094796687678,
"grad_norm": 0.12888643882767423,
"learning_rate": 1.9388164980066956e-05,
"loss": 0.5242,
"step": 923
},
{
"epoch": 0.4568038561364479,
"grad_norm": 0.13201836801637803,
"learning_rate": 1.9386823142612347e-05,
"loss": 0.5442,
"step": 924
},
{
"epoch": 0.45729823260412805,
"grad_norm": 0.11602934121400964,
"learning_rate": 1.938547988189003e-05,
"loss": 0.5345,
"step": 925
},
{
"epoch": 0.4577926090718082,
"grad_norm": 0.12535686830382267,
"learning_rate": 1.938413519810367e-05,
"loss": 0.5179,
"step": 926
},
{
"epoch": 0.4582869855394883,
"grad_norm": 0.12047560422808079,
"learning_rate": 1.938278909145716e-05,
"loss": 0.5374,
"step": 927
},
{
"epoch": 0.45878136200716846,
"grad_norm": 0.11456824701139671,
"learning_rate": 1.93814415621546e-05,
"loss": 0.5234,
"step": 928
},
{
"epoch": 0.4592757384748486,
"grad_norm": 0.13143209560339533,
"learning_rate": 1.9380092610400306e-05,
"loss": 0.4991,
"step": 929
},
{
"epoch": 0.45977011494252873,
"grad_norm": 0.11876519232726224,
"learning_rate": 1.9378742236398818e-05,
"loss": 0.5122,
"step": 930
},
{
"epoch": 0.46026449141020886,
"grad_norm": 0.12051865077257794,
"learning_rate": 1.9377390440354877e-05,
"loss": 0.5847,
"step": 931
},
{
"epoch": 0.460758867877889,
"grad_norm": 0.12527379992127805,
"learning_rate": 1.937603722247345e-05,
"loss": 0.5154,
"step": 932
},
{
"epoch": 0.46125324434556914,
"grad_norm": 0.19574159290389473,
"learning_rate": 1.937468258295972e-05,
"loss": 0.5626,
"step": 933
},
{
"epoch": 0.46174762081324927,
"grad_norm": 0.11955553132419712,
"learning_rate": 1.937332652201908e-05,
"loss": 0.5071,
"step": 934
},
{
"epoch": 0.4622419972809294,
"grad_norm": 0.12044990721783748,
"learning_rate": 1.9371969039857144e-05,
"loss": 0.509,
"step": 935
},
{
"epoch": 0.46273637374860954,
"grad_norm": 0.11620454296270502,
"learning_rate": 1.9370610136679738e-05,
"loss": 0.5208,
"step": 936
},
{
"epoch": 0.4632307502162897,
"grad_norm": 0.12631918479999057,
"learning_rate": 1.93692498126929e-05,
"loss": 0.5133,
"step": 937
},
{
"epoch": 0.46372512668396987,
"grad_norm": 0.12421183098996347,
"learning_rate": 1.9367888068102898e-05,
"loss": 0.5025,
"step": 938
},
{
"epoch": 0.46421950315165,
"grad_norm": 0.11400568899304779,
"learning_rate": 1.93665249031162e-05,
"loss": 0.5108,
"step": 939
},
{
"epoch": 0.46471387961933014,
"grad_norm": 0.12347156326810234,
"learning_rate": 1.9365160317939488e-05,
"loss": 0.5112,
"step": 940
},
{
"epoch": 0.4652082560870103,
"grad_norm": 0.1204391843966653,
"learning_rate": 1.936379431277967e-05,
"loss": 0.5233,
"step": 941
},
{
"epoch": 0.4657026325546904,
"grad_norm": 0.11501116487070531,
"learning_rate": 1.936242688784387e-05,
"loss": 0.4711,
"step": 942
},
{
"epoch": 0.46619700902237055,
"grad_norm": 0.12364077725843484,
"learning_rate": 1.936105804333942e-05,
"loss": 0.5208,
"step": 943
},
{
"epoch": 0.4666913854900507,
"grad_norm": 0.11623792253164529,
"learning_rate": 1.9359687779473865e-05,
"loss": 0.521,
"step": 944
},
{
"epoch": 0.4671857619577308,
"grad_norm": 0.11943595147454014,
"learning_rate": 1.9358316096454977e-05,
"loss": 0.5483,
"step": 945
},
{
"epoch": 0.46768013842541095,
"grad_norm": 0.11950906985330814,
"learning_rate": 1.9356942994490727e-05,
"loss": 0.4987,
"step": 946
},
{
"epoch": 0.4681745148930911,
"grad_norm": 0.11838653769283976,
"learning_rate": 1.935556847378932e-05,
"loss": 0.5175,
"step": 947
},
{
"epoch": 0.4686688913607712,
"grad_norm": 0.12826526708090183,
"learning_rate": 1.9354192534559162e-05,
"loss": 0.5179,
"step": 948
},
{
"epoch": 0.46916326782845136,
"grad_norm": 0.11418987770785785,
"learning_rate": 1.935281517700888e-05,
"loss": 0.5027,
"step": 949
},
{
"epoch": 0.4696576442961315,
"grad_norm": 0.14002001902397582,
"learning_rate": 1.9351436401347308e-05,
"loss": 0.5501,
"step": 950
},
{
"epoch": 0.47015202076381163,
"grad_norm": 0.11068913261543604,
"learning_rate": 1.935005620778351e-05,
"loss": 0.5129,
"step": 951
},
{
"epoch": 0.47064639723149176,
"grad_norm": 0.12871194352943152,
"learning_rate": 1.9348674596526753e-05,
"loss": 0.5118,
"step": 952
},
{
"epoch": 0.4711407736991719,
"grad_norm": 0.11805267123464908,
"learning_rate": 1.9347291567786522e-05,
"loss": 0.5204,
"step": 953
},
{
"epoch": 0.47163515016685204,
"grad_norm": 0.11991508245127702,
"learning_rate": 1.9345907121772516e-05,
"loss": 0.5151,
"step": 954
},
{
"epoch": 0.47212952663453217,
"grad_norm": 0.1145533452145342,
"learning_rate": 1.9344521258694655e-05,
"loss": 0.5098,
"step": 955
},
{
"epoch": 0.47262390310221236,
"grad_norm": 0.11929393482354868,
"learning_rate": 1.9343133978763062e-05,
"loss": 0.5011,
"step": 956
},
{
"epoch": 0.4731182795698925,
"grad_norm": 0.11663973654637984,
"learning_rate": 1.934174528218809e-05,
"loss": 0.4937,
"step": 957
},
{
"epoch": 0.47361265603757263,
"grad_norm": 0.11027248950786549,
"learning_rate": 1.934035516918029e-05,
"loss": 0.5049,
"step": 958
},
{
"epoch": 0.47410703250525277,
"grad_norm": 0.12105246416468389,
"learning_rate": 1.933896363995045e-05,
"loss": 0.5345,
"step": 959
},
{
"epoch": 0.4746014089729329,
"grad_norm": 0.11490728554190888,
"learning_rate": 1.933757069470954e-05,
"loss": 0.5189,
"step": 960
},
{
"epoch": 0.47509578544061304,
"grad_norm": 0.11828963859138435,
"learning_rate": 1.9336176333668783e-05,
"loss": 0.5545,
"step": 961
},
{
"epoch": 0.4755901619082932,
"grad_norm": 0.11400053090650882,
"learning_rate": 1.933478055703958e-05,
"loss": 0.5352,
"step": 962
},
{
"epoch": 0.4760845383759733,
"grad_norm": 2.1528054779569943,
"learning_rate": 1.9333383365033582e-05,
"loss": 0.5964,
"step": 963
},
{
"epoch": 0.47657891484365345,
"grad_norm": 0.15540474771564156,
"learning_rate": 1.9331984757862625e-05,
"loss": 0.5245,
"step": 964
},
{
"epoch": 0.4770732913113336,
"grad_norm": 0.12837769813119723,
"learning_rate": 1.933058473573877e-05,
"loss": 0.5119,
"step": 965
},
{
"epoch": 0.4775676677790137,
"grad_norm": 0.10918817662814939,
"learning_rate": 1.9329183298874303e-05,
"loss": 0.4869,
"step": 966
},
{
"epoch": 0.47806204424669385,
"grad_norm": 0.12984362223526158,
"learning_rate": 1.9327780447481712e-05,
"loss": 0.5316,
"step": 967
},
{
"epoch": 0.478556420714374,
"grad_norm": 0.12808152692318933,
"learning_rate": 1.93263761817737e-05,
"loss": 0.5351,
"step": 968
},
{
"epoch": 0.4790507971820541,
"grad_norm": 0.12580422378483894,
"learning_rate": 1.932497050196319e-05,
"loss": 0.5101,
"step": 969
},
{
"epoch": 0.47954517364973426,
"grad_norm": 0.12555118104803945,
"learning_rate": 1.9323563408263316e-05,
"loss": 0.5056,
"step": 970
},
{
"epoch": 0.4800395501174144,
"grad_norm": 0.5200541122835926,
"learning_rate": 1.9322154900887428e-05,
"loss": 0.5482,
"step": 971
},
{
"epoch": 0.48053392658509453,
"grad_norm": 0.1137002988166742,
"learning_rate": 1.9320744980049087e-05,
"loss": 0.5404,
"step": 972
},
{
"epoch": 0.48102830305277466,
"grad_norm": 0.11639078878132454,
"learning_rate": 1.9319333645962074e-05,
"loss": 0.5255,
"step": 973
},
{
"epoch": 0.4815226795204548,
"grad_norm": 0.6173263827514818,
"learning_rate": 1.9317920898840377e-05,
"loss": 0.6137,
"step": 974
},
{
"epoch": 0.482017055988135,
"grad_norm": 0.12648494370267793,
"learning_rate": 1.9316506738898207e-05,
"loss": 0.4795,
"step": 975
},
{
"epoch": 0.4825114324558151,
"grad_norm": 0.12606201569349051,
"learning_rate": 1.9315091166349982e-05,
"loss": 0.5044,
"step": 976
},
{
"epoch": 0.48300580892349526,
"grad_norm": 0.11879346781551944,
"learning_rate": 1.931367418141034e-05,
"loss": 0.514,
"step": 977
},
{
"epoch": 0.4835001853911754,
"grad_norm": 0.1284018609333871,
"learning_rate": 1.9312255784294127e-05,
"loss": 0.4982,
"step": 978
},
{
"epoch": 0.48399456185885553,
"grad_norm": 0.11391645198723178,
"learning_rate": 1.9310835975216405e-05,
"loss": 0.5106,
"step": 979
},
{
"epoch": 0.48448893832653567,
"grad_norm": 0.12222206957623863,
"learning_rate": 1.9309414754392452e-05,
"loss": 0.5322,
"step": 980
},
{
"epoch": 0.4849833147942158,
"grad_norm": 0.12729794887602738,
"learning_rate": 1.930799212203776e-05,
"loss": 0.5476,
"step": 981
},
{
"epoch": 0.48547769126189594,
"grad_norm": 0.40764102896065885,
"learning_rate": 1.930656807836804e-05,
"loss": 0.5603,
"step": 982
},
{
"epoch": 0.4859720677295761,
"grad_norm": 0.12137960632045262,
"learning_rate": 1.9305142623599196e-05,
"loss": 0.5341,
"step": 983
},
{
"epoch": 0.4864664441972562,
"grad_norm": 0.12650635370075572,
"learning_rate": 1.9303715757947376e-05,
"loss": 0.5175,
"step": 984
},
{
"epoch": 0.48696082066493634,
"grad_norm": 0.11324751878536045,
"learning_rate": 1.9302287481628918e-05,
"loss": 0.5127,
"step": 985
},
{
"epoch": 0.4874551971326165,
"grad_norm": 0.12957210201675948,
"learning_rate": 1.930085779486039e-05,
"loss": 0.5274,
"step": 986
},
{
"epoch": 0.4879495736002966,
"grad_norm": 0.19274656860858594,
"learning_rate": 1.9299426697858558e-05,
"loss": 0.5201,
"step": 987
},
{
"epoch": 0.48844395006797675,
"grad_norm": 0.11246976456257915,
"learning_rate": 1.9297994190840424e-05,
"loss": 0.4935,
"step": 988
},
{
"epoch": 0.4889383265356569,
"grad_norm": 0.11936698071208242,
"learning_rate": 1.9296560274023176e-05,
"loss": 0.4897,
"step": 989
},
{
"epoch": 0.489432703003337,
"grad_norm": 0.1216082490877951,
"learning_rate": 1.929512494762424e-05,
"loss": 0.5341,
"step": 990
},
{
"epoch": 0.48992707947101716,
"grad_norm": 0.11335898255591419,
"learning_rate": 1.9293688211861238e-05,
"loss": 0.536,
"step": 991
},
{
"epoch": 0.4904214559386973,
"grad_norm": 0.12256390057442543,
"learning_rate": 1.9292250066952023e-05,
"loss": 0.5161,
"step": 992
},
{
"epoch": 0.4909158324063775,
"grad_norm": 0.13002997169637734,
"learning_rate": 1.9290810513114645e-05,
"loss": 0.5264,
"step": 993
},
{
"epoch": 0.4914102088740576,
"grad_norm": 0.14079447127832895,
"learning_rate": 1.9289369550567378e-05,
"loss": 0.5311,
"step": 994
},
{
"epoch": 0.49190458534173775,
"grad_norm": 0.16926830960351885,
"learning_rate": 1.9287927179528707e-05,
"loss": 0.528,
"step": 995
},
{
"epoch": 0.4923989618094179,
"grad_norm": 0.11922510855628013,
"learning_rate": 1.9286483400217327e-05,
"loss": 0.5234,
"step": 996
},
{
"epoch": 0.492893338277098,
"grad_norm": 0.11961232413577055,
"learning_rate": 1.9285038212852153e-05,
"loss": 0.5078,
"step": 997
},
{
"epoch": 0.49338771474477816,
"grad_norm": 0.20966114305617783,
"learning_rate": 1.9283591617652307e-05,
"loss": 0.5304,
"step": 998
},
{
"epoch": 0.4938820912124583,
"grad_norm": 0.12885494777163334,
"learning_rate": 1.928214361483713e-05,
"loss": 0.52,
"step": 999
},
{
"epoch": 0.49437646768013843,
"grad_norm": 0.12764785858866287,
"learning_rate": 1.9280694204626172e-05,
"loss": 0.5385,
"step": 1000
},
{
"epoch": 0.49487084414781857,
"grad_norm": 0.11770050118991784,
"learning_rate": 1.9279243387239202e-05,
"loss": 0.5165,
"step": 1001
},
{
"epoch": 0.4953652206154987,
"grad_norm": 0.12112646239319945,
"learning_rate": 1.9277791162896195e-05,
"loss": 0.5319,
"step": 1002
},
{
"epoch": 0.49585959708317884,
"grad_norm": 0.1191245922492599,
"learning_rate": 1.9276337531817346e-05,
"loss": 0.5128,
"step": 1003
},
{
"epoch": 0.496353973550859,
"grad_norm": 0.1337623469478305,
"learning_rate": 1.927488249422306e-05,
"loss": 0.5062,
"step": 1004
},
{
"epoch": 0.4968483500185391,
"grad_norm": 0.11879730151474972,
"learning_rate": 1.927342605033395e-05,
"loss": 0.5196,
"step": 1005
},
{
"epoch": 0.49734272648621924,
"grad_norm": 0.13218878372649623,
"learning_rate": 1.9271968200370855e-05,
"loss": 0.5182,
"step": 1006
},
{
"epoch": 0.4978371029538994,
"grad_norm": 0.13013769402860653,
"learning_rate": 1.9270508944554815e-05,
"loss": 0.5051,
"step": 1007
},
{
"epoch": 0.4983314794215795,
"grad_norm": 0.1197074067625584,
"learning_rate": 1.926904828310709e-05,
"loss": 0.5193,
"step": 1008
},
{
"epoch": 0.49882585588925965,
"grad_norm": 0.31275690195328326,
"learning_rate": 1.926758621624915e-05,
"loss": 0.4899,
"step": 1009
},
{
"epoch": 0.4993202323569398,
"grad_norm": 0.13343897612287375,
"learning_rate": 1.926612274420269e-05,
"loss": 0.5246,
"step": 1010
},
{
"epoch": 0.4998146088246199,
"grad_norm": 0.12338259348857178,
"learning_rate": 1.9264657867189595e-05,
"loss": 0.5096,
"step": 1011
},
{
"epoch": 0.5003089852923001,
"grad_norm": 0.11882390653589099,
"learning_rate": 1.9263191585431972e-05,
"loss": 0.5076,
"step": 1012
},
{
"epoch": 0.5003089852923001,
"eval_loss": 0.521454393863678,
"eval_runtime": 101.0519,
"eval_samples_per_second": 300.38,
"eval_steps_per_second": 37.555,
"step": 1012
},
{
"epoch": 0.5008033617599802,
"grad_norm": 0.11850474705374807,
"learning_rate": 1.926172389915216e-05,
"loss": 0.4929,
"step": 1013
},
{
"epoch": 0.5012977382276603,
"grad_norm": 0.12064139299632978,
"learning_rate": 1.9260254808572685e-05,
"loss": 0.5238,
"step": 1014
},
{
"epoch": 0.5017921146953405,
"grad_norm": 0.13055490951144988,
"learning_rate": 1.9258784313916298e-05,
"loss": 0.5107,
"step": 1015
},
{
"epoch": 0.5022864911630206,
"grad_norm": 0.11207645756074343,
"learning_rate": 1.9257312415405963e-05,
"loss": 0.5112,
"step": 1016
},
{
"epoch": 0.5027808676307007,
"grad_norm": 0.3177430802146852,
"learning_rate": 1.9255839113264852e-05,
"loss": 0.5303,
"step": 1017
},
{
"epoch": 0.5032752440983809,
"grad_norm": 0.12101582311348857,
"learning_rate": 1.9254364407716356e-05,
"loss": 0.5043,
"step": 1018
},
{
"epoch": 0.503769620566061,
"grad_norm": 0.11428750852874056,
"learning_rate": 1.9252888298984077e-05,
"loss": 0.5356,
"step": 1019
},
{
"epoch": 0.5042639970337411,
"grad_norm": 0.1247289364346744,
"learning_rate": 1.9251410787291826e-05,
"loss": 0.5063,
"step": 1020
},
{
"epoch": 0.5047583735014214,
"grad_norm": 0.11948774519068592,
"learning_rate": 1.9249931872863625e-05,
"loss": 0.5114,
"step": 1021
},
{
"epoch": 0.5052527499691015,
"grad_norm": 0.11985658979068335,
"learning_rate": 1.924845155592372e-05,
"loss": 0.4833,
"step": 1022
},
{
"epoch": 0.5057471264367817,
"grad_norm": 0.11115703529580384,
"learning_rate": 1.924696983669656e-05,
"loss": 0.5152,
"step": 1023
},
{
"epoch": 0.5062415029044618,
"grad_norm": 0.11802388815181424,
"learning_rate": 1.924548671540681e-05,
"loss": 0.5138,
"step": 1024
},
{
"epoch": 0.5067358793721419,
"grad_norm": 0.10976487055926493,
"learning_rate": 1.9244002192279345e-05,
"loss": 0.521,
"step": 1025
},
{
"epoch": 0.5072302558398221,
"grad_norm": 0.12385573492216018,
"learning_rate": 1.9242516267539257e-05,
"loss": 0.5082,
"step": 1026
},
{
"epoch": 0.5077246323075022,
"grad_norm": 0.11004973829551619,
"learning_rate": 1.9241028941411846e-05,
"loss": 0.5096,
"step": 1027
},
{
"epoch": 0.5082190087751823,
"grad_norm": 0.1158278747669963,
"learning_rate": 1.9239540214122625e-05,
"loss": 0.4706,
"step": 1028
},
{
"epoch": 0.5087133852428625,
"grad_norm": 0.1217653346360588,
"learning_rate": 1.9238050085897324e-05,
"loss": 0.544,
"step": 1029
},
{
"epoch": 0.5092077617105426,
"grad_norm": 0.1132649397869388,
"learning_rate": 1.923655855696188e-05,
"loss": 0.5081,
"step": 1030
},
{
"epoch": 0.5097021381782227,
"grad_norm": 0.12799144686990507,
"learning_rate": 1.9235065627542444e-05,
"loss": 0.498,
"step": 1031
},
{
"epoch": 0.5101965146459029,
"grad_norm": 0.11024898843492398,
"learning_rate": 1.9233571297865383e-05,
"loss": 0.5426,
"step": 1032
},
{
"epoch": 0.510690891113583,
"grad_norm": 0.11379348694529834,
"learning_rate": 1.923207556815727e-05,
"loss": 0.5045,
"step": 1033
},
{
"epoch": 0.5111852675812631,
"grad_norm": 0.1198346207009853,
"learning_rate": 1.9230578438644897e-05,
"loss": 0.5311,
"step": 1034
},
{
"epoch": 0.5116796440489433,
"grad_norm": 0.14177245970737973,
"learning_rate": 1.9229079909555262e-05,
"loss": 0.5089,
"step": 1035
},
{
"epoch": 0.5121740205166234,
"grad_norm": 0.12186803379623583,
"learning_rate": 1.9227579981115577e-05,
"loss": 0.5622,
"step": 1036
},
{
"epoch": 0.5126683969843036,
"grad_norm": 0.3867690270694229,
"learning_rate": 1.922607865355327e-05,
"loss": 0.516,
"step": 1037
},
{
"epoch": 0.5131627734519837,
"grad_norm": 0.12127744608072127,
"learning_rate": 1.9224575927095976e-05,
"loss": 0.5366,
"step": 1038
},
{
"epoch": 0.5136571499196638,
"grad_norm": 0.18311459176052008,
"learning_rate": 1.9223071801971546e-05,
"loss": 0.5292,
"step": 1039
},
{
"epoch": 0.514151526387344,
"grad_norm": 0.11258431329081382,
"learning_rate": 1.922156627840804e-05,
"loss": 0.4832,
"step": 1040
},
{
"epoch": 0.5146459028550241,
"grad_norm": 0.11939499945052506,
"learning_rate": 1.9220059356633736e-05,
"loss": 0.5408,
"step": 1041
},
{
"epoch": 0.5151402793227042,
"grad_norm": 0.1927815770098612,
"learning_rate": 1.9218551036877113e-05,
"loss": 0.5084,
"step": 1042
},
{
"epoch": 0.5156346557903844,
"grad_norm": 0.20412650502902638,
"learning_rate": 1.9217041319366872e-05,
"loss": 0.5076,
"step": 1043
},
{
"epoch": 0.5161290322580645,
"grad_norm": 0.13426082785528162,
"learning_rate": 1.921553020433192e-05,
"loss": 0.5116,
"step": 1044
},
{
"epoch": 0.5166234087257446,
"grad_norm": 0.1261624182372947,
"learning_rate": 1.9214017692001384e-05,
"loss": 0.5294,
"step": 1045
},
{
"epoch": 0.5171177851934248,
"grad_norm": 0.12841191748669353,
"learning_rate": 1.921250378260459e-05,
"loss": 0.5483,
"step": 1046
},
{
"epoch": 0.5176121616611049,
"grad_norm": 0.13051113805269615,
"learning_rate": 1.921098847637109e-05,
"loss": 0.5467,
"step": 1047
},
{
"epoch": 0.518106538128785,
"grad_norm": 0.11682383470762873,
"learning_rate": 1.9209471773530634e-05,
"loss": 0.49,
"step": 1048
},
{
"epoch": 0.5186009145964652,
"grad_norm": 0.13283401897940134,
"learning_rate": 1.9207953674313193e-05,
"loss": 0.537,
"step": 1049
},
{
"epoch": 0.5190952910641453,
"grad_norm": 0.12528267503069854,
"learning_rate": 1.920643417894895e-05,
"loss": 0.5,
"step": 1050
},
{
"epoch": 0.5195896675318254,
"grad_norm": 0.136000269772083,
"learning_rate": 1.9204913287668295e-05,
"loss": 0.5622,
"step": 1051
},
{
"epoch": 0.5200840439995056,
"grad_norm": 0.1350803513655791,
"learning_rate": 1.9203391000701833e-05,
"loss": 0.5017,
"step": 1052
},
{
"epoch": 0.5205784204671857,
"grad_norm": 0.1220822117182817,
"learning_rate": 1.9201867318280375e-05,
"loss": 0.5161,
"step": 1053
},
{
"epoch": 0.5210727969348659,
"grad_norm": 0.11983445612313459,
"learning_rate": 1.9200342240634953e-05,
"loss": 0.4967,
"step": 1054
},
{
"epoch": 0.521567173402546,
"grad_norm": 0.12357411547160939,
"learning_rate": 1.9198815767996802e-05,
"loss": 0.522,
"step": 1055
},
{
"epoch": 0.5220615498702261,
"grad_norm": 0.313595609202733,
"learning_rate": 1.919728790059737e-05,
"loss": 0.5555,
"step": 1056
},
{
"epoch": 0.5225559263379063,
"grad_norm": 0.12074149555364364,
"learning_rate": 1.9195758638668326e-05,
"loss": 0.5417,
"step": 1057
},
{
"epoch": 0.5230503028055865,
"grad_norm": 0.1238837782900206,
"learning_rate": 1.9194227982441535e-05,
"loss": 0.5091,
"step": 1058
},
{
"epoch": 0.5235446792732666,
"grad_norm": 0.12866590779420056,
"learning_rate": 1.919269593214909e-05,
"loss": 0.5369,
"step": 1059
},
{
"epoch": 0.5240390557409468,
"grad_norm": 0.20520811791121957,
"learning_rate": 1.9191162488023277e-05,
"loss": 0.5641,
"step": 1060
},
{
"epoch": 0.5245334322086269,
"grad_norm": 0.1141457269947461,
"learning_rate": 1.9189627650296603e-05,
"loss": 0.5201,
"step": 1061
},
{
"epoch": 0.525027808676307,
"grad_norm": 0.12254361630335152,
"learning_rate": 1.9188091419201795e-05,
"loss": 0.5102,
"step": 1062
},
{
"epoch": 0.5255221851439872,
"grad_norm": 0.13929790496839506,
"learning_rate": 1.9186553794971776e-05,
"loss": 0.5201,
"step": 1063
},
{
"epoch": 0.5260165616116673,
"grad_norm": 0.12213112268446497,
"learning_rate": 1.918501477783969e-05,
"loss": 0.5104,
"step": 1064
},
{
"epoch": 0.5265109380793475,
"grad_norm": 0.1356359581954108,
"learning_rate": 1.9183474368038884e-05,
"loss": 0.5758,
"step": 1065
},
{
"epoch": 0.5270053145470276,
"grad_norm": 0.11205240607929501,
"learning_rate": 1.918193256580293e-05,
"loss": 0.5401,
"step": 1066
},
{
"epoch": 0.5274996910147077,
"grad_norm": 0.12380325116894934,
"learning_rate": 1.9180389371365594e-05,
"loss": 0.5126,
"step": 1067
},
{
"epoch": 0.5279940674823879,
"grad_norm": 0.11416467388545735,
"learning_rate": 1.917884478496086e-05,
"loss": 0.5117,
"step": 1068
},
{
"epoch": 0.528488443950068,
"grad_norm": 0.14998548779921173,
"learning_rate": 1.9177298806822933e-05,
"loss": 0.5226,
"step": 1069
},
{
"epoch": 0.5289828204177481,
"grad_norm": 0.11332937345053441,
"learning_rate": 1.9175751437186213e-05,
"loss": 0.5171,
"step": 1070
},
{
"epoch": 0.5294771968854283,
"grad_norm": 0.11607320147381389,
"learning_rate": 1.9174202676285324e-05,
"loss": 0.5343,
"step": 1071
},
{
"epoch": 0.5299715733531084,
"grad_norm": 0.11587776106905998,
"learning_rate": 1.917265252435509e-05,
"loss": 0.5026,
"step": 1072
},
{
"epoch": 0.5304659498207885,
"grad_norm": 0.11407937569726058,
"learning_rate": 1.9171100981630555e-05,
"loss": 0.5257,
"step": 1073
},
{
"epoch": 0.5309603262884687,
"grad_norm": 0.11687247074629664,
"learning_rate": 1.9169548048346968e-05,
"loss": 0.5168,
"step": 1074
},
{
"epoch": 0.5314547027561488,
"grad_norm": 0.11613310024462452,
"learning_rate": 1.916799372473979e-05,
"loss": 0.5298,
"step": 1075
},
{
"epoch": 0.531949079223829,
"grad_norm": 0.1268412149412109,
"learning_rate": 1.91664380110447e-05,
"loss": 0.4939,
"step": 1076
},
{
"epoch": 0.5324434556915091,
"grad_norm": 0.13345804698219815,
"learning_rate": 1.9164880907497576e-05,
"loss": 0.523,
"step": 1077
},
{
"epoch": 0.5329378321591892,
"grad_norm": 0.11288834884371653,
"learning_rate": 1.9163322414334515e-05,
"loss": 0.5598,
"step": 1078
},
{
"epoch": 0.5334322086268694,
"grad_norm": 0.12184496707638118,
"learning_rate": 1.9161762531791814e-05,
"loss": 0.5228,
"step": 1079
},
{
"epoch": 0.5339265850945495,
"grad_norm": 0.1214562276212525,
"learning_rate": 1.9160201260106e-05,
"loss": 0.5037,
"step": 1080
},
{
"epoch": 0.5344209615622296,
"grad_norm": 0.11999136693288817,
"learning_rate": 1.9158638599513793e-05,
"loss": 0.5336,
"step": 1081
},
{
"epoch": 0.5349153380299098,
"grad_norm": 0.11288026286769347,
"learning_rate": 1.915707455025213e-05,
"loss": 0.514,
"step": 1082
},
{
"epoch": 0.5354097144975899,
"grad_norm": 0.10870804704101308,
"learning_rate": 1.915550911255816e-05,
"loss": 0.4803,
"step": 1083
},
{
"epoch": 0.53590409096527,
"grad_norm": 0.11260792079394162,
"learning_rate": 1.9153942286669242e-05,
"loss": 0.5274,
"step": 1084
},
{
"epoch": 0.5363984674329502,
"grad_norm": 0.12081545111205375,
"learning_rate": 1.9152374072822945e-05,
"loss": 0.507,
"step": 1085
},
{
"epoch": 0.5368928439006303,
"grad_norm": 0.10923573620223533,
"learning_rate": 1.915080447125704e-05,
"loss": 0.4928,
"step": 1086
},
{
"epoch": 0.5373872203683104,
"grad_norm": 0.11645005995037606,
"learning_rate": 1.9149233482209528e-05,
"loss": 0.5329,
"step": 1087
},
{
"epoch": 0.5378815968359906,
"grad_norm": 0.11351999742808684,
"learning_rate": 1.9147661105918597e-05,
"loss": 0.495,
"step": 1088
},
{
"epoch": 0.5383759733036707,
"grad_norm": 0.11922505318449131,
"learning_rate": 1.9146087342622666e-05,
"loss": 0.528,
"step": 1089
},
{
"epoch": 0.5388703497713508,
"grad_norm": 0.10979842157532074,
"learning_rate": 1.914451219256035e-05,
"loss": 0.5176,
"step": 1090
},
{
"epoch": 0.539364726239031,
"grad_norm": 0.13599873830258313,
"learning_rate": 1.914293565597048e-05,
"loss": 0.5484,
"step": 1091
},
{
"epoch": 0.5398591027067111,
"grad_norm": 0.11386557310384245,
"learning_rate": 1.9141357733092103e-05,
"loss": 0.5137,
"step": 1092
},
{
"epoch": 0.5403534791743912,
"grad_norm": 0.11803676746358281,
"learning_rate": 1.913977842416446e-05,
"loss": 0.5116,
"step": 1093
},
{
"epoch": 0.5408478556420714,
"grad_norm": 0.11868732411457808,
"learning_rate": 1.913819772942702e-05,
"loss": 0.5342,
"step": 1094
},
{
"epoch": 0.5413422321097516,
"grad_norm": 0.11407787990685536,
"learning_rate": 1.9136615649119457e-05,
"loss": 0.4862,
"step": 1095
},
{
"epoch": 0.5418366085774318,
"grad_norm": 0.1121006086440415,
"learning_rate": 1.913503218348164e-05,
"loss": 0.5262,
"step": 1096
},
{
"epoch": 0.5423309850451119,
"grad_norm": 0.11619436029541114,
"learning_rate": 1.913344733275367e-05,
"loss": 0.5115,
"step": 1097
},
{
"epoch": 0.542825361512792,
"grad_norm": 0.12438356788902903,
"learning_rate": 1.9131861097175847e-05,
"loss": 0.5152,
"step": 1098
},
{
"epoch": 0.5433197379804722,
"grad_norm": 0.11497523383395566,
"learning_rate": 1.9130273476988676e-05,
"loss": 0.5029,
"step": 1099
},
{
"epoch": 0.5438141144481523,
"grad_norm": 0.11444576937482841,
"learning_rate": 1.912868447243289e-05,
"loss": 0.5226,
"step": 1100
},
{
"epoch": 0.5443084909158324,
"grad_norm": 0.12298651618663337,
"learning_rate": 1.912709408374941e-05,
"loss": 0.5112,
"step": 1101
},
{
"epoch": 0.5448028673835126,
"grad_norm": 0.11232764866733445,
"learning_rate": 1.9125502311179383e-05,
"loss": 0.4918,
"step": 1102
},
{
"epoch": 0.5452972438511927,
"grad_norm": 0.11579007055761932,
"learning_rate": 1.9123909154964156e-05,
"loss": 0.5069,
"step": 1103
},
{
"epoch": 0.5457916203188728,
"grad_norm": 0.12309892927761722,
"learning_rate": 1.9122314615345292e-05,
"loss": 0.5208,
"step": 1104
},
{
"epoch": 0.546285996786553,
"grad_norm": 0.12290868672753008,
"learning_rate": 1.912071869256456e-05,
"loss": 0.5326,
"step": 1105
},
{
"epoch": 0.5467803732542331,
"grad_norm": 0.1207416644689569,
"learning_rate": 1.911912138686394e-05,
"loss": 0.5098,
"step": 1106
},
{
"epoch": 0.5472747497219133,
"grad_norm": 0.11782955263016227,
"learning_rate": 1.911752269848563e-05,
"loss": 0.5123,
"step": 1107
},
{
"epoch": 0.5477691261895934,
"grad_norm": 0.11893060619900989,
"learning_rate": 1.9115922627672015e-05,
"loss": 0.5488,
"step": 1108
},
{
"epoch": 0.5482635026572735,
"grad_norm": 0.11509017582647214,
"learning_rate": 1.9114321174665717e-05,
"loss": 0.479,
"step": 1109
},
{
"epoch": 0.5487578791249537,
"grad_norm": 0.11807304683833675,
"learning_rate": 1.9112718339709546e-05,
"loss": 0.5299,
"step": 1110
},
{
"epoch": 0.5492522555926338,
"grad_norm": 0.10792586076307291,
"learning_rate": 1.9111114123046537e-05,
"loss": 0.5368,
"step": 1111
},
{
"epoch": 0.5497466320603139,
"grad_norm": 0.10243561080766322,
"learning_rate": 1.9109508524919923e-05,
"loss": 0.4817,
"step": 1112
},
{
"epoch": 0.5502410085279941,
"grad_norm": 0.25403931104489275,
"learning_rate": 1.9107901545573152e-05,
"loss": 0.5284,
"step": 1113
},
{
"epoch": 0.5507353849956742,
"grad_norm": 0.11121041603185804,
"learning_rate": 1.910629318524988e-05,
"loss": 0.5079,
"step": 1114
},
{
"epoch": 0.5512297614633543,
"grad_norm": 0.12126579089459825,
"learning_rate": 1.9104683444193978e-05,
"loss": 0.532,
"step": 1115
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.1131182988435925,
"learning_rate": 1.9103072322649514e-05,
"loss": 0.5152,
"step": 1116
},
{
"epoch": 0.5522185143987146,
"grad_norm": 0.23186244990583627,
"learning_rate": 1.910145982086078e-05,
"loss": 0.5453,
"step": 1117
},
{
"epoch": 0.5527128908663947,
"grad_norm": 0.11979744096633077,
"learning_rate": 1.9099845939072265e-05,
"loss": 0.5282,
"step": 1118
},
{
"epoch": 0.5532072673340749,
"grad_norm": 0.1099713885836756,
"learning_rate": 1.9098230677528673e-05,
"loss": 0.496,
"step": 1119
},
{
"epoch": 0.553701643801755,
"grad_norm": 0.12776561464984335,
"learning_rate": 1.909661403647492e-05,
"loss": 0.4892,
"step": 1120
},
{
"epoch": 0.5541960202694352,
"grad_norm": 0.11888063417789058,
"learning_rate": 1.909499601615612e-05,
"loss": 0.4933,
"step": 1121
},
{
"epoch": 0.5546903967371153,
"grad_norm": 0.1426398274868042,
"learning_rate": 1.9093376616817612e-05,
"loss": 0.4846,
"step": 1122
},
{
"epoch": 0.5551847732047954,
"grad_norm": 0.11880654567407789,
"learning_rate": 1.9091755838704932e-05,
"loss": 0.5192,
"step": 1123
},
{
"epoch": 0.5556791496724756,
"grad_norm": 0.1161195791376653,
"learning_rate": 1.9090133682063827e-05,
"loss": 0.4858,
"step": 1124
},
{
"epoch": 0.5561735261401557,
"grad_norm": 0.1467519933394219,
"learning_rate": 1.9088510147140258e-05,
"loss": 0.5126,
"step": 1125
},
{
"epoch": 0.5566679026078358,
"grad_norm": 0.11389708802166322,
"learning_rate": 1.908688523418039e-05,
"loss": 0.5029,
"step": 1126
},
{
"epoch": 0.557162279075516,
"grad_norm": 0.12845993080782175,
"learning_rate": 1.9085258943430603e-05,
"loss": 0.5096,
"step": 1127
},
{
"epoch": 0.5576566555431961,
"grad_norm": 0.11436177960002152,
"learning_rate": 1.9083631275137473e-05,
"loss": 0.5164,
"step": 1128
},
{
"epoch": 0.5581510320108762,
"grad_norm": 0.11177055796646765,
"learning_rate": 1.9082002229547806e-05,
"loss": 0.4966,
"step": 1129
},
{
"epoch": 0.5586454084785564,
"grad_norm": 0.12094739729608651,
"learning_rate": 1.9080371806908592e-05,
"loss": 0.5006,
"step": 1130
},
{
"epoch": 0.5591397849462365,
"grad_norm": 0.11649412142861489,
"learning_rate": 1.9078740007467046e-05,
"loss": 0.5037,
"step": 1131
},
{
"epoch": 0.5596341614139168,
"grad_norm": 0.12479992470648767,
"learning_rate": 1.9077106831470594e-05,
"loss": 0.5036,
"step": 1132
},
{
"epoch": 0.5601285378815969,
"grad_norm": 0.11268535661143626,
"learning_rate": 1.9075472279166858e-05,
"loss": 0.5783,
"step": 1133
},
{
"epoch": 0.560622914349277,
"grad_norm": 0.12096469412197454,
"learning_rate": 1.9073836350803678e-05,
"loss": 0.5172,
"step": 1134
},
{
"epoch": 0.5611172908169572,
"grad_norm": 0.11913864451512181,
"learning_rate": 1.90721990466291e-05,
"loss": 0.5037,
"step": 1135
},
{
"epoch": 0.5616116672846373,
"grad_norm": 0.13731879398598928,
"learning_rate": 1.907056036689138e-05,
"loss": 0.5245,
"step": 1136
},
{
"epoch": 0.5621060437523174,
"grad_norm": 0.12198735877308306,
"learning_rate": 1.9068920311838975e-05,
"loss": 0.5449,
"step": 1137
},
{
"epoch": 0.5626004202199976,
"grad_norm": 0.11980145912137934,
"learning_rate": 1.9067278881720565e-05,
"loss": 0.4994,
"step": 1138
},
{
"epoch": 0.5630947966876777,
"grad_norm": 0.11726401065765808,
"learning_rate": 1.9065636076785025e-05,
"loss": 0.5333,
"step": 1139
},
{
"epoch": 0.5635891731553578,
"grad_norm": 0.2092797715850389,
"learning_rate": 1.9063991897281443e-05,
"loss": 0.5156,
"step": 1140
},
{
"epoch": 0.564083549623038,
"grad_norm": 0.11807286798885562,
"learning_rate": 1.9062346343459122e-05,
"loss": 0.5452,
"step": 1141
},
{
"epoch": 0.5645779260907181,
"grad_norm": 0.11502830441677443,
"learning_rate": 1.9060699415567563e-05,
"loss": 0.4936,
"step": 1142
},
{
"epoch": 0.5650723025583982,
"grad_norm": 0.11294647495960879,
"learning_rate": 1.9059051113856476e-05,
"loss": 0.5064,
"step": 1143
},
{
"epoch": 0.5655666790260784,
"grad_norm": 0.11102539175675492,
"learning_rate": 1.9057401438575792e-05,
"loss": 0.4872,
"step": 1144
},
{
"epoch": 0.5660610554937585,
"grad_norm": 0.1206040136954458,
"learning_rate": 1.9055750389975634e-05,
"loss": 0.5111,
"step": 1145
},
{
"epoch": 0.5665554319614386,
"grad_norm": 0.11877410196393516,
"learning_rate": 1.9054097968306347e-05,
"loss": 0.5198,
"step": 1146
},
{
"epoch": 0.5670498084291188,
"grad_norm": 0.11681886514423166,
"learning_rate": 1.905244417381847e-05,
"loss": 0.533,
"step": 1147
},
{
"epoch": 0.5675441848967989,
"grad_norm": 0.12294179369340406,
"learning_rate": 1.9050789006762766e-05,
"loss": 0.5295,
"step": 1148
},
{
"epoch": 0.568038561364479,
"grad_norm": 0.1269906445587477,
"learning_rate": 1.9049132467390186e-05,
"loss": 0.498,
"step": 1149
},
{
"epoch": 0.5685329378321592,
"grad_norm": 0.11218239706355951,
"learning_rate": 1.904747455595192e-05,
"loss": 0.4931,
"step": 1150
},
{
"epoch": 0.5690273142998393,
"grad_norm": 0.12805004178511675,
"learning_rate": 1.904581527269933e-05,
"loss": 0.5212,
"step": 1151
},
{
"epoch": 0.5695216907675195,
"grad_norm": 0.10784931372236749,
"learning_rate": 1.9044154617884013e-05,
"loss": 0.5079,
"step": 1152
},
{
"epoch": 0.5700160672351996,
"grad_norm": 0.1142635361521326,
"learning_rate": 1.9042492591757757e-05,
"loss": 0.4928,
"step": 1153
},
{
"epoch": 0.5705104437028797,
"grad_norm": 0.12102507229964252,
"learning_rate": 1.904082919457257e-05,
"loss": 0.5084,
"step": 1154
},
{
"epoch": 0.5710048201705599,
"grad_norm": 0.12077142167157179,
"learning_rate": 1.9039164426580667e-05,
"loss": 0.4901,
"step": 1155
},
{
"epoch": 0.57149919663824,
"grad_norm": 0.12875426096962694,
"learning_rate": 1.9037498288034455e-05,
"loss": 0.514,
"step": 1156
},
{
"epoch": 0.5719935731059201,
"grad_norm": 0.237888193392425,
"learning_rate": 1.9035830779186567e-05,
"loss": 0.5197,
"step": 1157
},
{
"epoch": 0.5724879495736003,
"grad_norm": 0.1197629905334355,
"learning_rate": 1.9034161900289844e-05,
"loss": 0.5007,
"step": 1158
},
{
"epoch": 0.5729823260412804,
"grad_norm": 0.12600981739456626,
"learning_rate": 1.9032491651597316e-05,
"loss": 0.5259,
"step": 1159
},
{
"epoch": 0.5734767025089605,
"grad_norm": 0.11994460356688487,
"learning_rate": 1.9030820033362238e-05,
"loss": 0.4808,
"step": 1160
},
{
"epoch": 0.5739710789766407,
"grad_norm": 0.2790145760418215,
"learning_rate": 1.902914704583807e-05,
"loss": 0.5139,
"step": 1161
},
{
"epoch": 0.5744654554443208,
"grad_norm": 0.11147560554652179,
"learning_rate": 1.9027472689278475e-05,
"loss": 0.5201,
"step": 1162
},
{
"epoch": 0.574959831912001,
"grad_norm": 0.10670809368651435,
"learning_rate": 1.902579696393733e-05,
"loss": 0.5035,
"step": 1163
},
{
"epoch": 0.5754542083796811,
"grad_norm": 0.11666273662661288,
"learning_rate": 1.9024119870068705e-05,
"loss": 0.5048,
"step": 1164
},
{
"epoch": 0.5759485848473612,
"grad_norm": 0.11091831497894705,
"learning_rate": 1.902244140792689e-05,
"loss": 0.5105,
"step": 1165
},
{
"epoch": 0.5764429613150414,
"grad_norm": 0.11875646980328271,
"learning_rate": 1.902076157776639e-05,
"loss": 0.5371,
"step": 1166
},
{
"epoch": 0.5769373377827215,
"grad_norm": 0.4115913024036671,
"learning_rate": 1.90190803798419e-05,
"loss": 0.5621,
"step": 1167
},
{
"epoch": 0.5774317142504016,
"grad_norm": 0.11979874677149438,
"learning_rate": 1.9017397814408332e-05,
"loss": 0.5263,
"step": 1168
},
{
"epoch": 0.5779260907180818,
"grad_norm": 0.11497731428576904,
"learning_rate": 1.90157138817208e-05,
"loss": 0.4905,
"step": 1169
},
{
"epoch": 0.578420467185762,
"grad_norm": 0.1090092687087333,
"learning_rate": 1.9014028582034635e-05,
"loss": 0.4908,
"step": 1170
},
{
"epoch": 0.5789148436534421,
"grad_norm": 0.12221676447876842,
"learning_rate": 1.901234191560536e-05,
"loss": 0.4962,
"step": 1171
},
{
"epoch": 0.5794092201211223,
"grad_norm": 0.1076723840264286,
"learning_rate": 1.9010653882688723e-05,
"loss": 0.4906,
"step": 1172
},
{
"epoch": 0.5799035965888024,
"grad_norm": 0.12306704726540672,
"learning_rate": 1.9008964483540662e-05,
"loss": 0.5186,
"step": 1173
},
{
"epoch": 0.5803979730564826,
"grad_norm": 2.2488700795549446,
"learning_rate": 1.900727371841734e-05,
"loss": 0.5496,
"step": 1174
},
{
"epoch": 0.5808923495241627,
"grad_norm": 0.13046700040036505,
"learning_rate": 1.900558158757511e-05,
"loss": 0.522,
"step": 1175
},
{
"epoch": 0.5813867259918428,
"grad_norm": 0.12890906613237926,
"learning_rate": 1.900388809127054e-05,
"loss": 0.5111,
"step": 1176
},
{
"epoch": 0.581881102459523,
"grad_norm": 0.12131836090659337,
"learning_rate": 1.900219322976041e-05,
"loss": 0.5039,
"step": 1177
},
{
"epoch": 0.5823754789272031,
"grad_norm": 0.11348278072822075,
"learning_rate": 1.9000497003301698e-05,
"loss": 0.5086,
"step": 1178
},
{
"epoch": 0.5828698553948832,
"grad_norm": 0.12207691597210336,
"learning_rate": 1.899879941215159e-05,
"loss": 0.5069,
"step": 1179
},
{
"epoch": 0.5833642318625634,
"grad_norm": 0.12258587249952316,
"learning_rate": 1.899710045656749e-05,
"loss": 0.5074,
"step": 1180
},
{
"epoch": 0.5838586083302435,
"grad_norm": 0.11948776261229883,
"learning_rate": 1.8995400136806993e-05,
"loss": 0.4971,
"step": 1181
},
{
"epoch": 0.5843529847979236,
"grad_norm": 0.1277728150088618,
"learning_rate": 1.8993698453127907e-05,
"loss": 0.5014,
"step": 1182
},
{
"epoch": 0.5848473612656038,
"grad_norm": 0.12547448456661583,
"learning_rate": 1.8991995405788254e-05,
"loss": 0.5194,
"step": 1183
},
{
"epoch": 0.5853417377332839,
"grad_norm": 0.1292009892470848,
"learning_rate": 1.8990290995046255e-05,
"loss": 0.5336,
"step": 1184
},
{
"epoch": 0.585836114200964,
"grad_norm": 0.12376544696782638,
"learning_rate": 1.898858522116034e-05,
"loss": 0.4916,
"step": 1185
},
{
"epoch": 0.5863304906686442,
"grad_norm": 0.11586678179493598,
"learning_rate": 1.8986878084389143e-05,
"loss": 0.5026,
"step": 1186
},
{
"epoch": 0.5868248671363243,
"grad_norm": 0.13164588089518922,
"learning_rate": 1.898516958499151e-05,
"loss": 0.5034,
"step": 1187
},
{
"epoch": 0.5873192436040044,
"grad_norm": 0.11215771483805048,
"learning_rate": 1.898345972322648e-05,
"loss": 0.4806,
"step": 1188
},
{
"epoch": 0.5878136200716846,
"grad_norm": 0.12067863950703991,
"learning_rate": 1.898174849935333e-05,
"loss": 0.5097,
"step": 1189
},
{
"epoch": 0.5883079965393647,
"grad_norm": 1.1272098958841226,
"learning_rate": 1.8980035913631503e-05,
"loss": 0.5716,
"step": 1190
},
{
"epoch": 0.5888023730070449,
"grad_norm": 0.12666851110164606,
"learning_rate": 1.8978321966320677e-05,
"loss": 0.5368,
"step": 1191
},
{
"epoch": 0.589296749474725,
"grad_norm": 0.12010168318269775,
"learning_rate": 1.8976606657680724e-05,
"loss": 0.4805,
"step": 1192
},
{
"epoch": 0.5897911259424051,
"grad_norm": 0.1194401673542767,
"learning_rate": 1.8974889987971732e-05,
"loss": 0.5292,
"step": 1193
},
{
"epoch": 0.5902855024100853,
"grad_norm": 0.1308045997515742,
"learning_rate": 1.8973171957453986e-05,
"loss": 0.5059,
"step": 1194
},
{
"epoch": 0.5907798788777654,
"grad_norm": 1.1647416551428618,
"learning_rate": 1.8971452566387972e-05,
"loss": 0.7968,
"step": 1195
},
{
"epoch": 0.5912742553454455,
"grad_norm": 0.12524474887503426,
"learning_rate": 1.8969731815034405e-05,
"loss": 0.508,
"step": 1196
},
{
"epoch": 0.5917686318131257,
"grad_norm": 0.1922543645358851,
"learning_rate": 1.8968009703654186e-05,
"loss": 0.4858,
"step": 1197
},
{
"epoch": 0.5922630082808058,
"grad_norm": 0.19349447117605875,
"learning_rate": 1.896628623250843e-05,
"loss": 0.5129,
"step": 1198
},
{
"epoch": 0.5927573847484859,
"grad_norm": 0.1806833145752208,
"learning_rate": 1.896456140185845e-05,
"loss": 0.5056,
"step": 1199
},
{
"epoch": 0.5932517612161661,
"grad_norm": 0.3526661673268218,
"learning_rate": 1.896283521196578e-05,
"loss": 0.5549,
"step": 1200
},
{
"epoch": 0.5937461376838462,
"grad_norm": 0.1263425063762779,
"learning_rate": 1.896110766309215e-05,
"loss": 0.5148,
"step": 1201
},
{
"epoch": 0.5942405141515263,
"grad_norm": 0.16476375453662173,
"learning_rate": 1.8959378755499497e-05,
"loss": 0.5443,
"step": 1202
},
{
"epoch": 0.5947348906192065,
"grad_norm": 0.14141690649424346,
"learning_rate": 1.895764848944996e-05,
"loss": 0.5346,
"step": 1203
},
{
"epoch": 0.5952292670868866,
"grad_norm": 0.2180352045095123,
"learning_rate": 1.8955916865205896e-05,
"loss": 0.5305,
"step": 1204
},
{
"epoch": 0.5957236435545668,
"grad_norm": 0.12495626800487794,
"learning_rate": 1.8954183883029858e-05,
"loss": 0.4917,
"step": 1205
},
{
"epoch": 0.5962180200222469,
"grad_norm": 0.12093543094972024,
"learning_rate": 1.8952449543184606e-05,
"loss": 0.5315,
"step": 1206
},
{
"epoch": 0.5967123964899271,
"grad_norm": 0.12401411004841363,
"learning_rate": 1.8950713845933112e-05,
"loss": 0.5022,
"step": 1207
},
{
"epoch": 0.5972067729576073,
"grad_norm": 0.20716574603293011,
"learning_rate": 1.894897679153855e-05,
"loss": 0.5587,
"step": 1208
},
{
"epoch": 0.5977011494252874,
"grad_norm": 0.12225093660227954,
"learning_rate": 1.894723838026429e-05,
"loss": 0.5094,
"step": 1209
},
{
"epoch": 0.5981955258929675,
"grad_norm": 0.11999234606736457,
"learning_rate": 1.8945498612373926e-05,
"loss": 0.5283,
"step": 1210
},
{
"epoch": 0.5986899023606477,
"grad_norm": 0.12570510620574052,
"learning_rate": 1.8943757488131242e-05,
"loss": 0.5011,
"step": 1211
},
{
"epoch": 0.5991842788283278,
"grad_norm": 0.11519771585690748,
"learning_rate": 1.8942015007800242e-05,
"loss": 0.4706,
"step": 1212
},
{
"epoch": 0.599678655296008,
"grad_norm": 0.12245759302122998,
"learning_rate": 1.894027117164512e-05,
"loss": 0.499,
"step": 1213
},
{
"epoch": 0.6001730317636881,
"grad_norm": 0.12013483525073018,
"learning_rate": 1.893852597993029e-05,
"loss": 0.5236,
"step": 1214
},
{
"epoch": 0.6006674082313682,
"grad_norm": 0.12319620436523498,
"learning_rate": 1.893677943292036e-05,
"loss": 0.513,
"step": 1215
},
{
"epoch": 0.6011617846990484,
"grad_norm": 0.13028914290217244,
"learning_rate": 1.893503153088015e-05,
"loss": 0.5273,
"step": 1216
},
{
"epoch": 0.6016561611667285,
"grad_norm": 0.12296725993772517,
"learning_rate": 1.8933282274074682e-05,
"loss": 0.53,
"step": 1217
},
{
"epoch": 0.6021505376344086,
"grad_norm": 0.1187243749838658,
"learning_rate": 1.8931531662769188e-05,
"loss": 0.5264,
"step": 1218
},
{
"epoch": 0.6026449141020888,
"grad_norm": 0.14610940075507256,
"learning_rate": 1.8929779697229108e-05,
"loss": 0.5127,
"step": 1219
},
{
"epoch": 0.6031392905697689,
"grad_norm": 0.11651065452407182,
"learning_rate": 1.892802637772007e-05,
"loss": 0.4888,
"step": 1220
},
{
"epoch": 0.603633667037449,
"grad_norm": 0.1240772231006754,
"learning_rate": 1.8926271704507927e-05,
"loss": 0.5064,
"step": 1221
},
{
"epoch": 0.6041280435051292,
"grad_norm": 0.5995777166903399,
"learning_rate": 1.892451567785873e-05,
"loss": 0.5357,
"step": 1222
},
{
"epoch": 0.6046224199728093,
"grad_norm": 0.12531374792625322,
"learning_rate": 1.892275829803873e-05,
"loss": 0.4882,
"step": 1223
},
{
"epoch": 0.6051167964404894,
"grad_norm": 0.1325348267891098,
"learning_rate": 1.8920999565314395e-05,
"loss": 0.5209,
"step": 1224
},
{
"epoch": 0.6056111729081696,
"grad_norm": 0.13323487957362878,
"learning_rate": 1.891923947995238e-05,
"loss": 0.5325,
"step": 1225
},
{
"epoch": 0.6061055493758497,
"grad_norm": 0.12621765841155688,
"learning_rate": 1.891747804221957e-05,
"loss": 0.504,
"step": 1226
},
{
"epoch": 0.6065999258435298,
"grad_norm": 0.13861167851261585,
"learning_rate": 1.8915715252383035e-05,
"loss": 0.5034,
"step": 1227
},
{
"epoch": 0.60709430231121,
"grad_norm": 0.12357691750370768,
"learning_rate": 1.891395111071005e-05,
"loss": 0.5215,
"step": 1228
},
{
"epoch": 0.6075886787788901,
"grad_norm": 0.11888449120283463,
"learning_rate": 1.891218561746811e-05,
"loss": 0.4759,
"step": 1229
},
{
"epoch": 0.6080830552465702,
"grad_norm": 0.24072514669276893,
"learning_rate": 1.8910418772924903e-05,
"loss": 0.4635,
"step": 1230
},
{
"epoch": 0.6085774317142504,
"grad_norm": 0.1254279185525524,
"learning_rate": 1.8908650577348323e-05,
"loss": 0.5341,
"step": 1231
},
{
"epoch": 0.6090718081819305,
"grad_norm": 0.12923424651658855,
"learning_rate": 1.8906881031006476e-05,
"loss": 0.5303,
"step": 1232
},
{
"epoch": 0.6095661846496107,
"grad_norm": 0.11874866984660053,
"learning_rate": 1.890511013416766e-05,
"loss": 0.5183,
"step": 1233
},
{
"epoch": 0.6100605611172908,
"grad_norm": 0.12827895620518567,
"learning_rate": 1.8903337887100398e-05,
"loss": 0.5051,
"step": 1234
},
{
"epoch": 0.6105549375849709,
"grad_norm": 0.15537297430897645,
"learning_rate": 1.8901564290073392e-05,
"loss": 0.5247,
"step": 1235
},
{
"epoch": 0.6110493140526511,
"grad_norm": 0.12534044620683238,
"learning_rate": 1.8899789343355567e-05,
"loss": 0.4943,
"step": 1236
},
{
"epoch": 0.6115436905203312,
"grad_norm": 0.1303733338917178,
"learning_rate": 1.889801304721605e-05,
"loss": 0.4874,
"step": 1237
},
{
"epoch": 0.6120380669880113,
"grad_norm": 0.11933796613588935,
"learning_rate": 1.8896235401924167e-05,
"loss": 0.4961,
"step": 1238
},
{
"epoch": 0.6125324434556915,
"grad_norm": 0.12061855271714765,
"learning_rate": 1.889445640774945e-05,
"loss": 0.5225,
"step": 1239
},
{
"epoch": 0.6130268199233716,
"grad_norm": 0.21738742455747687,
"learning_rate": 1.889267606496164e-05,
"loss": 0.5289,
"step": 1240
},
{
"epoch": 0.6135211963910517,
"grad_norm": 0.11489738573545522,
"learning_rate": 1.8890894373830682e-05,
"loss": 0.481,
"step": 1241
},
{
"epoch": 0.6140155728587319,
"grad_norm": 0.11940258234364634,
"learning_rate": 1.888911133462672e-05,
"loss": 0.4965,
"step": 1242
},
{
"epoch": 0.614509949326412,
"grad_norm": 0.1217245774046428,
"learning_rate": 1.8887326947620108e-05,
"loss": 0.488,
"step": 1243
},
{
"epoch": 0.6150043257940923,
"grad_norm": 0.11373351016181124,
"learning_rate": 1.8885541213081397e-05,
"loss": 0.5066,
"step": 1244
},
{
"epoch": 0.6154987022617724,
"grad_norm": 0.11025838327244784,
"learning_rate": 1.8883754131281353e-05,
"loss": 0.4719,
"step": 1245
},
{
"epoch": 0.6159930787294525,
"grad_norm": 0.10830286005910694,
"learning_rate": 1.8881965702490936e-05,
"loss": 0.4995,
"step": 1246
},
{
"epoch": 0.6164874551971327,
"grad_norm": 0.11315122120638849,
"learning_rate": 1.888017592698132e-05,
"loss": 0.5524,
"step": 1247
},
{
"epoch": 0.6169818316648128,
"grad_norm": 0.11198987620713033,
"learning_rate": 1.887838480502387e-05,
"loss": 0.5015,
"step": 1248
},
{
"epoch": 0.6174762081324929,
"grad_norm": 0.11144622367223964,
"learning_rate": 1.8876592336890166e-05,
"loss": 0.4815,
"step": 1249
},
{
"epoch": 0.6179705846001731,
"grad_norm": 0.12743452930310786,
"learning_rate": 1.8874798522851994e-05,
"loss": 0.5079,
"step": 1250
},
{
"epoch": 0.6184649610678532,
"grad_norm": 0.11087060247957652,
"learning_rate": 1.8873003363181336e-05,
"loss": 0.5356,
"step": 1251
},
{
"epoch": 0.6189593375355333,
"grad_norm": 0.12178872599042004,
"learning_rate": 1.8871206858150383e-05,
"loss": 0.5303,
"step": 1252
},
{
"epoch": 0.6194537140032135,
"grad_norm": 0.12469643233236438,
"learning_rate": 1.8869409008031523e-05,
"loss": 0.5566,
"step": 1253
},
{
"epoch": 0.6199480904708936,
"grad_norm": 0.11351014625279408,
"learning_rate": 1.8867609813097355e-05,
"loss": 0.5114,
"step": 1254
},
{
"epoch": 0.6204424669385737,
"grad_norm": 0.11868634561809649,
"learning_rate": 1.8865809273620688e-05,
"loss": 0.5012,
"step": 1255
},
{
"epoch": 0.6209368434062539,
"grad_norm": 0.1161875627354053,
"learning_rate": 1.886400738987452e-05,
"loss": 0.5199,
"step": 1256
},
{
"epoch": 0.621431219873934,
"grad_norm": 0.11391716121477169,
"learning_rate": 1.8862204162132055e-05,
"loss": 0.4721,
"step": 1257
},
{
"epoch": 0.6219255963416142,
"grad_norm": 0.11189454088136999,
"learning_rate": 1.8860399590666717e-05,
"loss": 0.4772,
"step": 1258
},
{
"epoch": 0.6224199728092943,
"grad_norm": 0.12358241614205094,
"learning_rate": 1.8858593675752115e-05,
"loss": 0.5451,
"step": 1259
},
{
"epoch": 0.6229143492769744,
"grad_norm": 0.1335177704839934,
"learning_rate": 1.885678641766207e-05,
"loss": 0.5279,
"step": 1260
},
{
"epoch": 0.6234087257446546,
"grad_norm": 5.090523094250374,
"learning_rate": 1.885497781667061e-05,
"loss": 0.6092,
"step": 1261
},
{
"epoch": 0.6239031022123347,
"grad_norm": 0.17287096856473075,
"learning_rate": 1.8853167873051954e-05,
"loss": 0.5231,
"step": 1262
},
{
"epoch": 0.6243974786800148,
"grad_norm": 0.1140173952947101,
"learning_rate": 1.885135658708054e-05,
"loss": 0.5397,
"step": 1263
},
{
"epoch": 0.624891855147695,
"grad_norm": 0.1522446076129913,
"learning_rate": 1.8849543959031002e-05,
"loss": 0.5244,
"step": 1264
},
{
"epoch": 0.6253862316153751,
"grad_norm": 0.5071591674479499,
"learning_rate": 1.8847729989178173e-05,
"loss": 0.5562,
"step": 1265
},
{
"epoch": 0.6258806080830552,
"grad_norm": 0.14405911883648215,
"learning_rate": 1.88459146777971e-05,
"loss": 0.5186,
"step": 1266
},
{
"epoch": 0.6263749845507354,
"grad_norm": 0.12998144044104795,
"learning_rate": 1.8844098025163024e-05,
"loss": 0.487,
"step": 1267
},
{
"epoch": 0.6268693610184155,
"grad_norm": 0.13609732682848494,
"learning_rate": 1.8842280031551394e-05,
"loss": 0.5214,
"step": 1268
},
{
"epoch": 0.6273637374860956,
"grad_norm": 0.13763012337768976,
"learning_rate": 1.884046069723786e-05,
"loss": 0.5367,
"step": 1269
},
{
"epoch": 0.6278581139537758,
"grad_norm": 0.12775725777775476,
"learning_rate": 1.883864002249828e-05,
"loss": 0.4869,
"step": 1270
},
{
"epoch": 0.6283524904214559,
"grad_norm": 0.138916606918435,
"learning_rate": 1.8836818007608716e-05,
"loss": 0.5224,
"step": 1271
},
{
"epoch": 0.628846866889136,
"grad_norm": 0.11813906542935543,
"learning_rate": 1.8834994652845418e-05,
"loss": 0.5249,
"step": 1272
},
{
"epoch": 0.6293412433568162,
"grad_norm": 0.13791258222978908,
"learning_rate": 1.8833169958484858e-05,
"loss": 0.5504,
"step": 1273
},
{
"epoch": 0.6298356198244963,
"grad_norm": 0.13346284045719614,
"learning_rate": 1.88313439248037e-05,
"loss": 0.5333,
"step": 1274
},
{
"epoch": 0.6303299962921765,
"grad_norm": 0.11600985870183438,
"learning_rate": 1.8829516552078816e-05,
"loss": 0.5063,
"step": 1275
},
{
"epoch": 0.6308243727598566,
"grad_norm": 0.12788687220198527,
"learning_rate": 1.8827687840587284e-05,
"loss": 0.5116,
"step": 1276
},
{
"epoch": 0.6313187492275367,
"grad_norm": 0.11437239923010335,
"learning_rate": 1.882585779060637e-05,
"loss": 0.5129,
"step": 1277
},
{
"epoch": 0.6318131256952169,
"grad_norm": 0.12218827491497228,
"learning_rate": 1.8824026402413565e-05,
"loss": 0.5741,
"step": 1278
},
{
"epoch": 0.632307502162897,
"grad_norm": 0.11574945795287242,
"learning_rate": 1.8822193676286543e-05,
"loss": 0.5128,
"step": 1279
},
{
"epoch": 0.6328018786305771,
"grad_norm": 0.11157208173110948,
"learning_rate": 1.8820359612503193e-05,
"loss": 0.4802,
"step": 1280
},
{
"epoch": 0.6332962550982574,
"grad_norm": 0.11718308942795794,
"learning_rate": 1.8818524211341603e-05,
"loss": 0.5315,
"step": 1281
},
{
"epoch": 0.6337906315659375,
"grad_norm": 0.11532724968077634,
"learning_rate": 1.8816687473080064e-05,
"loss": 0.499,
"step": 1282
},
{
"epoch": 0.6342850080336176,
"grad_norm": 0.11134841135827982,
"learning_rate": 1.881484939799707e-05,
"loss": 0.4872,
"step": 1283
},
{
"epoch": 0.6347793845012978,
"grad_norm": 0.12402247584734023,
"learning_rate": 1.8813009986371313e-05,
"loss": 0.5162,
"step": 1284
},
{
"epoch": 0.6352737609689779,
"grad_norm": 0.12494388075966506,
"learning_rate": 1.88111692384817e-05,
"loss": 0.5371,
"step": 1285
},
{
"epoch": 0.635768137436658,
"grad_norm": 0.12236103179249005,
"learning_rate": 1.880932715460732e-05,
"loss": 0.5111,
"step": 1286
},
{
"epoch": 0.6362625139043382,
"grad_norm": 0.11825326201278498,
"learning_rate": 1.8807483735027493e-05,
"loss": 0.5164,
"step": 1287
},
{
"epoch": 0.6367568903720183,
"grad_norm": 0.1161179261112344,
"learning_rate": 1.8805638980021713e-05,
"loss": 0.495,
"step": 1288
},
{
"epoch": 0.6372512668396985,
"grad_norm": 0.11598438435232514,
"learning_rate": 1.8803792889869696e-05,
"loss": 0.5227,
"step": 1289
},
{
"epoch": 0.6377456433073786,
"grad_norm": 0.11197451962736156,
"learning_rate": 1.8801945464851353e-05,
"loss": 0.5172,
"step": 1290
},
{
"epoch": 0.6382400197750587,
"grad_norm": 0.11021889665870686,
"learning_rate": 1.8800096705246793e-05,
"loss": 0.4736,
"step": 1291
},
{
"epoch": 0.6387343962427389,
"grad_norm": 0.11092949718627143,
"learning_rate": 1.8798246611336338e-05,
"loss": 0.5225,
"step": 1292
},
{
"epoch": 0.639228772710419,
"grad_norm": 0.11186804869621357,
"learning_rate": 1.8796395183400504e-05,
"loss": 0.5148,
"step": 1293
},
{
"epoch": 0.6397231491780991,
"grad_norm": 0.5683064883783551,
"learning_rate": 1.879454242172001e-05,
"loss": 0.562,
"step": 1294
},
{
"epoch": 0.6402175256457793,
"grad_norm": 0.11735671065271452,
"learning_rate": 1.8792688326575783e-05,
"loss": 0.5306,
"step": 1295
},
{
"epoch": 0.6407119021134594,
"grad_norm": 0.1254550988488079,
"learning_rate": 1.8790832898248947e-05,
"loss": 0.537,
"step": 1296
},
{
"epoch": 0.6412062785811395,
"grad_norm": 0.12040963372665399,
"learning_rate": 1.878897613702083e-05,
"loss": 0.4726,
"step": 1297
},
{
"epoch": 0.6417006550488197,
"grad_norm": 0.13403598497021058,
"learning_rate": 1.8787118043172962e-05,
"loss": 0.513,
"step": 1298
},
{
"epoch": 0.6421950315164998,
"grad_norm": 0.13124961742510283,
"learning_rate": 1.878525861698707e-05,
"loss": 0.534,
"step": 1299
},
{
"epoch": 0.64268940798418,
"grad_norm": 0.14280597196903833,
"learning_rate": 1.878339785874509e-05,
"loss": 0.522,
"step": 1300
},
{
"epoch": 0.6431837844518601,
"grad_norm": 0.9652394221425379,
"learning_rate": 1.878153576872916e-05,
"loss": 0.4775,
"step": 1301
},
{
"epoch": 0.6436781609195402,
"grad_norm": 0.16041639209172548,
"learning_rate": 1.8779672347221617e-05,
"loss": 0.5094,
"step": 1302
},
{
"epoch": 0.6441725373872204,
"grad_norm": 0.13168698663550962,
"learning_rate": 1.8777807594505e-05,
"loss": 0.4945,
"step": 1303
},
{
"epoch": 0.6446669138549005,
"grad_norm": 0.11519376886635173,
"learning_rate": 1.8775941510862047e-05,
"loss": 0.4674,
"step": 1304
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.26070085393768383,
"learning_rate": 1.877407409657571e-05,
"loss": 0.5235,
"step": 1305
},
{
"epoch": 0.6456556667902608,
"grad_norm": 1.209785670122721,
"learning_rate": 1.877220535192912e-05,
"loss": 0.5601,
"step": 1306
},
{
"epoch": 0.6461500432579409,
"grad_norm": 0.18503617107767706,
"learning_rate": 1.8770335277205638e-05,
"loss": 0.5346,
"step": 1307
},
{
"epoch": 0.646644419725621,
"grad_norm": 0.16343658834008698,
"learning_rate": 1.8768463872688803e-05,
"loss": 0.502,
"step": 1308
},
{
"epoch": 0.6471387961933012,
"grad_norm": 0.15681611538153967,
"learning_rate": 1.876659113866237e-05,
"loss": 0.5171,
"step": 1309
},
{
"epoch": 0.6476331726609813,
"grad_norm": 0.15269036932141064,
"learning_rate": 1.8764717075410286e-05,
"loss": 0.5221,
"step": 1310
},
{
"epoch": 0.6481275491286614,
"grad_norm": 0.17265019212319357,
"learning_rate": 1.8762841683216702e-05,
"loss": 0.4894,
"step": 1311
},
{
"epoch": 0.6486219255963416,
"grad_norm": 0.14653407236619484,
"learning_rate": 1.876096496236598e-05,
"loss": 0.4883,
"step": 1312
},
{
"epoch": 0.6491163020640217,
"grad_norm": 0.14552919556725105,
"learning_rate": 1.8759086913142672e-05,
"loss": 0.5331,
"step": 1313
},
{
"epoch": 0.6496106785317018,
"grad_norm": 0.11348650075288412,
"learning_rate": 1.8757207535831538e-05,
"loss": 0.5016,
"step": 1314
},
{
"epoch": 0.650105054999382,
"grad_norm": 0.12673722009712143,
"learning_rate": 1.875532683071753e-05,
"loss": 0.5109,
"step": 1315
},
{
"epoch": 0.6505994314670621,
"grad_norm": 0.2518384915000513,
"learning_rate": 1.8753444798085813e-05,
"loss": 0.5127,
"step": 1316
},
{
"epoch": 0.6510938079347423,
"grad_norm": 0.12399374464241876,
"learning_rate": 1.8751561438221747e-05,
"loss": 0.4815,
"step": 1317
},
{
"epoch": 0.6515881844024225,
"grad_norm": 0.1353827742959406,
"learning_rate": 1.87496767514109e-05,
"loss": 0.5021,
"step": 1318
},
{
"epoch": 0.6520825608701026,
"grad_norm": 0.35967523086265013,
"learning_rate": 1.8747790737939027e-05,
"loss": 0.5377,
"step": 1319
},
{
"epoch": 0.6525769373377828,
"grad_norm": 0.13053211082761526,
"learning_rate": 1.8745903398092096e-05,
"loss": 0.5029,
"step": 1320
},
{
"epoch": 0.6530713138054629,
"grad_norm": 0.14612253057843655,
"learning_rate": 1.8744014732156276e-05,
"loss": 0.5099,
"step": 1321
},
{
"epoch": 0.653565690273143,
"grad_norm": 0.14969954651924727,
"learning_rate": 1.8742124740417934e-05,
"loss": 0.5229,
"step": 1322
},
{
"epoch": 0.6540600667408232,
"grad_norm": 0.13874727583559354,
"learning_rate": 1.874023342316363e-05,
"loss": 0.5152,
"step": 1323
},
{
"epoch": 0.6545544432085033,
"grad_norm": 0.13470610163144578,
"learning_rate": 1.8738340780680143e-05,
"loss": 0.5429,
"step": 1324
},
{
"epoch": 0.6550488196761834,
"grad_norm": 0.13095434862944072,
"learning_rate": 1.8736446813254444e-05,
"loss": 0.5191,
"step": 1325
},
{
"epoch": 0.6555431961438636,
"grad_norm": 0.13622542583364822,
"learning_rate": 1.873455152117369e-05,
"loss": 0.5087,
"step": 1326
},
{
"epoch": 0.6560375726115437,
"grad_norm": 2.4089655054779966,
"learning_rate": 1.8732654904725268e-05,
"loss": 0.581,
"step": 1327
},
{
"epoch": 0.6565319490792239,
"grad_norm": 0.13427696895571103,
"learning_rate": 1.8730756964196743e-05,
"loss": 0.4811,
"step": 1328
},
{
"epoch": 0.657026325546904,
"grad_norm": 0.13469878220210854,
"learning_rate": 1.872885769987589e-05,
"loss": 0.5245,
"step": 1329
},
{
"epoch": 0.6575207020145841,
"grad_norm": 0.12679193443916337,
"learning_rate": 1.872695711205068e-05,
"loss": 0.5385,
"step": 1330
},
{
"epoch": 0.6580150784822643,
"grad_norm": 0.15293916516742095,
"learning_rate": 1.8725055201009295e-05,
"loss": 0.5093,
"step": 1331
},
{
"epoch": 0.6585094549499444,
"grad_norm": 0.1474836645819693,
"learning_rate": 1.8723151967040104e-05,
"loss": 0.5141,
"step": 1332
},
{
"epoch": 0.6590038314176245,
"grad_norm": 0.13964805447410933,
"learning_rate": 1.8721247410431686e-05,
"loss": 0.5705,
"step": 1333
},
{
"epoch": 0.6594982078853047,
"grad_norm": 0.2772609394518583,
"learning_rate": 1.8719341531472816e-05,
"loss": 0.5263,
"step": 1334
},
{
"epoch": 0.6599925843529848,
"grad_norm": 0.1152651646843607,
"learning_rate": 1.871743433045247e-05,
"loss": 0.4967,
"step": 1335
},
{
"epoch": 0.6604869608206649,
"grad_norm": 0.38814113806800077,
"learning_rate": 1.871552580765983e-05,
"loss": 0.5188,
"step": 1336
},
{
"epoch": 0.6609813372883451,
"grad_norm": 0.11982163887192067,
"learning_rate": 1.8713615963384267e-05,
"loss": 0.4808,
"step": 1337
},
{
"epoch": 0.6614757137560252,
"grad_norm": 0.1271544476372943,
"learning_rate": 1.8711704797915367e-05,
"loss": 0.4963,
"step": 1338
},
{
"epoch": 0.6619700902237053,
"grad_norm": 0.1634864896955938,
"learning_rate": 1.87097923115429e-05,
"loss": 0.5347,
"step": 1339
},
{
"epoch": 0.6624644666913855,
"grad_norm": 4.411396837465339,
"learning_rate": 1.870787850455685e-05,
"loss": 0.859,
"step": 1340
},
{
"epoch": 0.6629588431590656,
"grad_norm": 0.18660823182862138,
"learning_rate": 1.87059633772474e-05,
"loss": 0.4916,
"step": 1341
},
{
"epoch": 0.6634532196267457,
"grad_norm": 0.13247569018685637,
"learning_rate": 1.870404692990492e-05,
"loss": 0.4925,
"step": 1342
},
{
"epoch": 0.6639475960944259,
"grad_norm": 0.1892496227624473,
"learning_rate": 1.8702129162819998e-05,
"loss": 0.5257,
"step": 1343
},
{
"epoch": 0.664441972562106,
"grad_norm": 0.1496529773976184,
"learning_rate": 1.8700210076283406e-05,
"loss": 0.507,
"step": 1344
},
{
"epoch": 0.6649363490297862,
"grad_norm": 0.187607211420035,
"learning_rate": 1.869828967058613e-05,
"loss": 0.4928,
"step": 1345
},
{
"epoch": 0.6654307254974663,
"grad_norm": 0.17400549598674062,
"learning_rate": 1.8696367946019348e-05,
"loss": 0.5356,
"step": 1346
},
{
"epoch": 0.6659251019651464,
"grad_norm": 0.15041416391309773,
"learning_rate": 1.8694444902874437e-05,
"loss": 0.5606,
"step": 1347
},
{
"epoch": 0.6664194784328266,
"grad_norm": 0.13246522871533675,
"learning_rate": 1.8692520541442975e-05,
"loss": 0.5053,
"step": 1348
},
{
"epoch": 0.6669138549005067,
"grad_norm": 0.14316265889445648,
"learning_rate": 1.869059486201675e-05,
"loss": 0.492,
"step": 1349
},
{
"epoch": 0.6674082313681868,
"grad_norm": 0.12577951576017496,
"learning_rate": 1.868866786488773e-05,
"loss": 0.52,
"step": 1350
},
{
"epoch": 0.667902607835867,
"grad_norm": 0.1358255805858443,
"learning_rate": 1.8686739550348102e-05,
"loss": 0.4928,
"step": 1351
},
{
"epoch": 0.6683969843035471,
"grad_norm": 0.11972953827470229,
"learning_rate": 1.8684809918690246e-05,
"loss": 0.5242,
"step": 1352
},
{
"epoch": 0.6688913607712272,
"grad_norm": 0.1310527594931636,
"learning_rate": 1.8682878970206734e-05,
"loss": 0.5014,
"step": 1353
},
{
"epoch": 0.6693857372389074,
"grad_norm": 0.12464327910553967,
"learning_rate": 1.8680946705190343e-05,
"loss": 0.5283,
"step": 1354
},
{
"epoch": 0.6698801137065876,
"grad_norm": 0.1230149295214164,
"learning_rate": 1.8679013123934064e-05,
"loss": 0.5124,
"step": 1355
},
{
"epoch": 0.6703744901742678,
"grad_norm": 0.13274672842445956,
"learning_rate": 1.8677078226731056e-05,
"loss": 0.5031,
"step": 1356
},
{
"epoch": 0.6708688666419479,
"grad_norm": 0.11890321377715915,
"learning_rate": 1.8675142013874706e-05,
"loss": 0.5165,
"step": 1357
},
{
"epoch": 0.671363243109628,
"grad_norm": 0.1251110781203776,
"learning_rate": 1.8673204485658596e-05,
"loss": 0.4959,
"step": 1358
},
{
"epoch": 0.6718576195773082,
"grad_norm": 0.10814311141942899,
"learning_rate": 1.867126564237649e-05,
"loss": 0.5137,
"step": 1359
},
{
"epoch": 0.6723519960449883,
"grad_norm": 0.11894922338428737,
"learning_rate": 1.866932548432237e-05,
"loss": 0.5408,
"step": 1360
},
{
"epoch": 0.6728463725126684,
"grad_norm": 0.111153998258356,
"learning_rate": 1.8667384011790407e-05,
"loss": 0.5147,
"step": 1361
},
{
"epoch": 0.6733407489803486,
"grad_norm": 0.12437935585347111,
"learning_rate": 1.8665441225074975e-05,
"loss": 0.5187,
"step": 1362
},
{
"epoch": 0.6738351254480287,
"grad_norm": 0.11437602196005378,
"learning_rate": 1.866349712447065e-05,
"loss": 0.4792,
"step": 1363
},
{
"epoch": 0.6743295019157088,
"grad_norm": 0.11046232444959526,
"learning_rate": 1.8661551710272207e-05,
"loss": 0.5029,
"step": 1364
},
{
"epoch": 0.674823878383389,
"grad_norm": 0.1364855448777029,
"learning_rate": 1.865960498277461e-05,
"loss": 0.5358,
"step": 1365
},
{
"epoch": 0.6753182548510691,
"grad_norm": 0.12030831855267278,
"learning_rate": 1.8657656942273036e-05,
"loss": 0.5077,
"step": 1366
},
{
"epoch": 0.6758126313187492,
"grad_norm": 0.11015779359112779,
"learning_rate": 1.865570758906285e-05,
"loss": 0.4787,
"step": 1367
},
{
"epoch": 0.6763070077864294,
"grad_norm": 0.11851250375799721,
"learning_rate": 1.8653756923439623e-05,
"loss": 0.5279,
"step": 1368
},
{
"epoch": 0.6768013842541095,
"grad_norm": 0.11564809605606637,
"learning_rate": 1.865180494569912e-05,
"loss": 0.4979,
"step": 1369
},
{
"epoch": 0.6772957607217897,
"grad_norm": 0.10684937155653837,
"learning_rate": 1.8649851656137313e-05,
"loss": 0.4785,
"step": 1370
},
{
"epoch": 0.6777901371894698,
"grad_norm": 0.11420989758423922,
"learning_rate": 1.8647897055050362e-05,
"loss": 0.5078,
"step": 1371
},
{
"epoch": 0.6782845136571499,
"grad_norm": 0.11156651962150763,
"learning_rate": 1.8645941142734636e-05,
"loss": 0.4789,
"step": 1372
},
{
"epoch": 0.6787788901248301,
"grad_norm": 0.1116970121855764,
"learning_rate": 1.8643983919486695e-05,
"loss": 0.5166,
"step": 1373
},
{
"epoch": 0.6792732665925102,
"grad_norm": 0.10436500536662897,
"learning_rate": 1.8642025385603303e-05,
"loss": 0.4605,
"step": 1374
},
{
"epoch": 0.6797676430601903,
"grad_norm": 0.1151034569709627,
"learning_rate": 1.864006554138142e-05,
"loss": 0.5109,
"step": 1375
},
{
"epoch": 0.6802620195278705,
"grad_norm": 0.10947197373732502,
"learning_rate": 1.863810438711821e-05,
"loss": 0.5112,
"step": 1376
},
{
"epoch": 0.6807563959955506,
"grad_norm": 0.11167558274019695,
"learning_rate": 1.863614192311102e-05,
"loss": 0.4627,
"step": 1377
},
{
"epoch": 0.6812507724632307,
"grad_norm": 0.10919452388537848,
"learning_rate": 1.8634178149657415e-05,
"loss": 0.5311,
"step": 1378
},
{
"epoch": 0.6817451489309109,
"grad_norm": 0.11504119563672141,
"learning_rate": 1.863221306705515e-05,
"loss": 0.528,
"step": 1379
},
{
"epoch": 0.682239525398591,
"grad_norm": 0.12628883273503372,
"learning_rate": 1.8630246675602175e-05,
"loss": 0.5088,
"step": 1380
},
{
"epoch": 0.6827339018662711,
"grad_norm": 0.11468879146601323,
"learning_rate": 1.8628278975596644e-05,
"loss": 0.4727,
"step": 1381
},
{
"epoch": 0.6832282783339513,
"grad_norm": 0.11986700944603655,
"learning_rate": 1.862630996733691e-05,
"loss": 0.509,
"step": 1382
},
{
"epoch": 0.6837226548016314,
"grad_norm": 0.10493428812811512,
"learning_rate": 1.862433965112152e-05,
"loss": 0.5036,
"step": 1383
},
{
"epoch": 0.6842170312693115,
"grad_norm": 0.10571531309478495,
"learning_rate": 1.862236802724922e-05,
"loss": 0.4924,
"step": 1384
},
{
"epoch": 0.6847114077369917,
"grad_norm": 0.11412428829207727,
"learning_rate": 1.8620395096018955e-05,
"loss": 0.5336,
"step": 1385
},
{
"epoch": 0.6852057842046718,
"grad_norm": 0.10841020034316988,
"learning_rate": 1.861842085772987e-05,
"loss": 0.5318,
"step": 1386
},
{
"epoch": 0.685700160672352,
"grad_norm": 0.22200721033339804,
"learning_rate": 1.861644531268131e-05,
"loss": 0.5396,
"step": 1387
},
{
"epoch": 0.6861945371400321,
"grad_norm": 0.10579518205427718,
"learning_rate": 1.8614468461172813e-05,
"loss": 0.4678,
"step": 1388
},
{
"epoch": 0.6866889136077122,
"grad_norm": 0.11418755458714168,
"learning_rate": 1.861249030350411e-05,
"loss": 0.4971,
"step": 1389
},
{
"epoch": 0.6871832900753924,
"grad_norm": 0.10489618250296798,
"learning_rate": 1.8610510839975152e-05,
"loss": 0.4827,
"step": 1390
},
{
"epoch": 0.6876776665430725,
"grad_norm": 0.11035203641434968,
"learning_rate": 1.8608530070886058e-05,
"loss": 0.5129,
"step": 1391
},
{
"epoch": 0.6881720430107527,
"grad_norm": 0.11433326091620316,
"learning_rate": 1.860654799653717e-05,
"loss": 0.5236,
"step": 1392
},
{
"epoch": 0.6886664194784329,
"grad_norm": 0.11099233019562886,
"learning_rate": 1.8604564617229012e-05,
"loss": 0.5059,
"step": 1393
},
{
"epoch": 0.689160795946113,
"grad_norm": 0.1042654786853705,
"learning_rate": 1.8602579933262317e-05,
"loss": 0.4682,
"step": 1394
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.12324206835161378,
"learning_rate": 1.8600593944938006e-05,
"loss": 0.5201,
"step": 1395
},
{
"epoch": 0.6901495488814733,
"grad_norm": 0.11735206713560134,
"learning_rate": 1.8598606652557206e-05,
"loss": 0.5168,
"step": 1396
},
{
"epoch": 0.6906439253491534,
"grad_norm": 0.10768062997994363,
"learning_rate": 1.859661805642124e-05,
"loss": 0.5199,
"step": 1397
},
{
"epoch": 0.6911383018168336,
"grad_norm": 0.17457124965923057,
"learning_rate": 1.8594628156831623e-05,
"loss": 0.5103,
"step": 1398
},
{
"epoch": 0.6916326782845137,
"grad_norm": 0.1106854250030357,
"learning_rate": 1.8592636954090072e-05,
"loss": 0.4864,
"step": 1399
},
{
"epoch": 0.6921270547521938,
"grad_norm": 0.10998982798051019,
"learning_rate": 1.8590644448498502e-05,
"loss": 0.5076,
"step": 1400
},
{
"epoch": 0.692621431219874,
"grad_norm": 0.11178214666815083,
"learning_rate": 1.8588650640359023e-05,
"loss": 0.5081,
"step": 1401
},
{
"epoch": 0.6931158076875541,
"grad_norm": 0.11290097581909352,
"learning_rate": 1.858665552997395e-05,
"loss": 0.5185,
"step": 1402
},
{
"epoch": 0.6936101841552342,
"grad_norm": 0.10957430872685286,
"learning_rate": 1.858465911764578e-05,
"loss": 0.4747,
"step": 1403
},
{
"epoch": 0.6941045606229144,
"grad_norm": 0.10919124909036755,
"learning_rate": 1.8582661403677225e-05,
"loss": 0.535,
"step": 1404
},
{
"epoch": 0.6945989370905945,
"grad_norm": 0.11753701406423242,
"learning_rate": 1.8580662388371185e-05,
"loss": 0.4789,
"step": 1405
},
{
"epoch": 0.6950933135582746,
"grad_norm": 0.11969998615360558,
"learning_rate": 1.8578662072030755e-05,
"loss": 0.5149,
"step": 1406
},
{
"epoch": 0.6955876900259548,
"grad_norm": 0.11282688545710802,
"learning_rate": 1.8576660454959233e-05,
"loss": 0.5069,
"step": 1407
},
{
"epoch": 0.6960820664936349,
"grad_norm": 0.10601285741123824,
"learning_rate": 1.8574657537460114e-05,
"loss": 0.4814,
"step": 1408
},
{
"epoch": 0.696576442961315,
"grad_norm": 0.1050933660214265,
"learning_rate": 1.8572653319837087e-05,
"loss": 0.4995,
"step": 1409
},
{
"epoch": 0.6970708194289952,
"grad_norm": 0.10830744611311044,
"learning_rate": 1.857064780239404e-05,
"loss": 0.5345,
"step": 1410
},
{
"epoch": 0.6975651958966753,
"grad_norm": 0.10655284616114237,
"learning_rate": 1.8568640985435054e-05,
"loss": 0.5013,
"step": 1411
},
{
"epoch": 0.6980595723643555,
"grad_norm": 0.18006919997443957,
"learning_rate": 1.8566632869264415e-05,
"loss": 0.5357,
"step": 1412
},
{
"epoch": 0.6985539488320356,
"grad_norm": 0.10725694475249062,
"learning_rate": 1.8564623454186603e-05,
"loss": 0.5183,
"step": 1413
},
{
"epoch": 0.6990483252997157,
"grad_norm": 0.11425155262488641,
"learning_rate": 1.856261274050629e-05,
"loss": 0.5111,
"step": 1414
},
{
"epoch": 0.6995427017673959,
"grad_norm": 0.1097625331246144,
"learning_rate": 1.856060072852835e-05,
"loss": 0.503,
"step": 1415
},
{
"epoch": 0.700037078235076,
"grad_norm": 0.11615141856622023,
"learning_rate": 1.8558587418557844e-05,
"loss": 0.5168,
"step": 1416
},
{
"epoch": 0.7005314547027561,
"grad_norm": 0.11718365571908665,
"learning_rate": 1.8556572810900054e-05,
"loss": 0.4923,
"step": 1417
},
{
"epoch": 0.7010258311704363,
"grad_norm": 0.10825110519823394,
"learning_rate": 1.8554556905860432e-05,
"loss": 0.4658,
"step": 1418
},
{
"epoch": 0.7015202076381164,
"grad_norm": 0.11377891578081231,
"learning_rate": 1.855253970374464e-05,
"loss": 0.4947,
"step": 1419
},
{
"epoch": 0.7020145841057965,
"grad_norm": 0.11897993556040283,
"learning_rate": 1.8550521204858536e-05,
"loss": 0.5311,
"step": 1420
},
{
"epoch": 0.7025089605734767,
"grad_norm": 0.12476610677524289,
"learning_rate": 1.8548501409508168e-05,
"loss": 0.4809,
"step": 1421
},
{
"epoch": 0.7030033370411568,
"grad_norm": 0.112144528603003,
"learning_rate": 1.8546480317999792e-05,
"loss": 0.5056,
"step": 1422
},
{
"epoch": 0.7034977135088369,
"grad_norm": 0.11365908255376408,
"learning_rate": 1.854445793063985e-05,
"loss": 0.4785,
"step": 1423
},
{
"epoch": 0.7039920899765171,
"grad_norm": 0.11980020741352096,
"learning_rate": 1.8542434247734986e-05,
"loss": 0.4784,
"step": 1424
},
{
"epoch": 0.7044864664441972,
"grad_norm": 0.13137112295451428,
"learning_rate": 1.8540409269592038e-05,
"loss": 0.5331,
"step": 1425
},
{
"epoch": 0.7049808429118773,
"grad_norm": 0.11218317216770166,
"learning_rate": 1.853838299651804e-05,
"loss": 0.5275,
"step": 1426
},
{
"epoch": 0.7054752193795575,
"grad_norm": 0.11642269349864628,
"learning_rate": 1.8536355428820222e-05,
"loss": 0.505,
"step": 1427
},
{
"epoch": 0.7059695958472376,
"grad_norm": 0.1450296776128419,
"learning_rate": 1.8534326566806023e-05,
"loss": 0.4965,
"step": 1428
},
{
"epoch": 0.7064639723149179,
"grad_norm": 0.11882153285276555,
"learning_rate": 1.8532296410783052e-05,
"loss": 0.5053,
"step": 1429
},
{
"epoch": 0.706958348782598,
"grad_norm": 0.11066120837419129,
"learning_rate": 1.853026496105914e-05,
"loss": 0.4883,
"step": 1430
},
{
"epoch": 0.7074527252502781,
"grad_norm": 0.11706238921025472,
"learning_rate": 1.85282322179423e-05,
"loss": 0.5129,
"step": 1431
},
{
"epoch": 0.7079471017179583,
"grad_norm": 0.14975940833927873,
"learning_rate": 1.8526198181740745e-05,
"loss": 0.5048,
"step": 1432
},
{
"epoch": 0.7084414781856384,
"grad_norm": 0.11794894672439166,
"learning_rate": 1.8524162852762885e-05,
"loss": 0.5244,
"step": 1433
},
{
"epoch": 0.7089358546533185,
"grad_norm": 0.109246939577005,
"learning_rate": 1.852212623131732e-05,
"loss": 0.478,
"step": 1434
},
{
"epoch": 0.7094302311209987,
"grad_norm": 0.12393192866516871,
"learning_rate": 1.8520088317712856e-05,
"loss": 0.5141,
"step": 1435
},
{
"epoch": 0.7099246075886788,
"grad_norm": 0.1136844325919169,
"learning_rate": 1.851804911225848e-05,
"loss": 0.5006,
"step": 1436
},
{
"epoch": 0.710418984056359,
"grad_norm": 0.11562787085935147,
"learning_rate": 1.85160086152634e-05,
"loss": 0.5065,
"step": 1437
},
{
"epoch": 0.7109133605240391,
"grad_norm": 0.11162440977225564,
"learning_rate": 1.8513966827036996e-05,
"loss": 0.4857,
"step": 1438
},
{
"epoch": 0.7114077369917192,
"grad_norm": 0.1092304713584763,
"learning_rate": 1.851192374788885e-05,
"loss": 0.5001,
"step": 1439
},
{
"epoch": 0.7119021134593994,
"grad_norm": 0.10857795374254413,
"learning_rate": 1.8509879378128748e-05,
"loss": 0.493,
"step": 1440
},
{
"epoch": 0.7123964899270795,
"grad_norm": 0.1406533871253959,
"learning_rate": 1.8507833718066658e-05,
"loss": 0.5053,
"step": 1441
},
{
"epoch": 0.7128908663947596,
"grad_norm": 0.10513928526184999,
"learning_rate": 1.8505786768012756e-05,
"loss": 0.4862,
"step": 1442
},
{
"epoch": 0.7133852428624398,
"grad_norm": 0.1098562049044276,
"learning_rate": 1.850373852827741e-05,
"loss": 0.5311,
"step": 1443
},
{
"epoch": 0.7138796193301199,
"grad_norm": 0.10974737333767093,
"learning_rate": 1.8501688999171178e-05,
"loss": 0.5281,
"step": 1444
},
{
"epoch": 0.7143739957978,
"grad_norm": 0.10759962079804275,
"learning_rate": 1.849963818100482e-05,
"loss": 0.4946,
"step": 1445
},
{
"epoch": 0.7148683722654802,
"grad_norm": 0.10684636145195382,
"learning_rate": 1.849758607408929e-05,
"loss": 0.5072,
"step": 1446
},
{
"epoch": 0.7153627487331603,
"grad_norm": 0.11140148229704698,
"learning_rate": 1.8495532678735734e-05,
"loss": 0.491,
"step": 1447
},
{
"epoch": 0.7158571252008404,
"grad_norm": 0.11133439813734025,
"learning_rate": 1.84934779952555e-05,
"loss": 0.4969,
"step": 1448
},
{
"epoch": 0.7163515016685206,
"grad_norm": 0.11199781209735776,
"learning_rate": 1.8491422023960123e-05,
"loss": 0.5599,
"step": 1449
},
{
"epoch": 0.7168458781362007,
"grad_norm": 0.10474784188634474,
"learning_rate": 1.8489364765161342e-05,
"loss": 0.4981,
"step": 1450
},
{
"epoch": 0.7173402546038808,
"grad_norm": 0.10653967674619919,
"learning_rate": 1.8487306219171084e-05,
"loss": 0.4989,
"step": 1451
},
{
"epoch": 0.717834631071561,
"grad_norm": 0.11568120139311856,
"learning_rate": 1.8485246386301474e-05,
"loss": 0.539,
"step": 1452
},
{
"epoch": 0.7183290075392411,
"grad_norm": 0.11508770671438484,
"learning_rate": 1.848318526686483e-05,
"loss": 0.5111,
"step": 1453
},
{
"epoch": 0.7188233840069213,
"grad_norm": 0.11562379722189534,
"learning_rate": 1.8481122861173676e-05,
"loss": 0.5128,
"step": 1454
},
{
"epoch": 0.7193177604746014,
"grad_norm": 0.11008673407911358,
"learning_rate": 1.847905916954071e-05,
"loss": 0.4676,
"step": 1455
},
{
"epoch": 0.7198121369422815,
"grad_norm": 0.113117000407283,
"learning_rate": 1.8476994192278847e-05,
"loss": 0.5255,
"step": 1456
},
{
"epoch": 0.7203065134099617,
"grad_norm": 0.11395708101764822,
"learning_rate": 1.847492792970118e-05,
"loss": 0.4963,
"step": 1457
},
{
"epoch": 0.7208008898776418,
"grad_norm": 0.11015062305789061,
"learning_rate": 1.8472860382121012e-05,
"loss": 0.4995,
"step": 1458
},
{
"epoch": 0.7212952663453219,
"grad_norm": 0.1522601690591974,
"learning_rate": 1.8470791549851825e-05,
"loss": 0.5162,
"step": 1459
},
{
"epoch": 0.7217896428130021,
"grad_norm": 0.10960187263996567,
"learning_rate": 1.846872143320731e-05,
"loss": 0.518,
"step": 1460
},
{
"epoch": 0.7222840192806822,
"grad_norm": 0.11234567971265944,
"learning_rate": 1.8466650032501342e-05,
"loss": 0.5185,
"step": 1461
},
{
"epoch": 0.7227783957483623,
"grad_norm": 0.12730063998685015,
"learning_rate": 1.8464577348047993e-05,
"loss": 0.5129,
"step": 1462
},
{
"epoch": 0.7232727722160425,
"grad_norm": 0.10987061432282379,
"learning_rate": 1.846250338016154e-05,
"loss": 0.4951,
"step": 1463
},
{
"epoch": 0.7237671486837226,
"grad_norm": 0.11453090628652893,
"learning_rate": 1.8460428129156434e-05,
"loss": 0.516,
"step": 1464
},
{
"epoch": 0.7242615251514027,
"grad_norm": 0.31047101361778445,
"learning_rate": 1.8458351595347348e-05,
"loss": 0.4981,
"step": 1465
},
{
"epoch": 0.7247559016190829,
"grad_norm": 0.11807081883081363,
"learning_rate": 1.845627377904912e-05,
"loss": 0.5037,
"step": 1466
},
{
"epoch": 0.7252502780867631,
"grad_norm": 0.1154413830521611,
"learning_rate": 1.8454194680576808e-05,
"loss": 0.5061,
"step": 1467
},
{
"epoch": 0.7257446545544433,
"grad_norm": 0.1284439234264946,
"learning_rate": 1.845211430024565e-05,
"loss": 0.5029,
"step": 1468
},
{
"epoch": 0.7262390310221234,
"grad_norm": 0.1249134151354428,
"learning_rate": 1.8450032638371075e-05,
"loss": 0.5306,
"step": 1469
},
{
"epoch": 0.7267334074898035,
"grad_norm": 0.1091615871962084,
"learning_rate": 1.8447949695268723e-05,
"loss": 0.4604,
"step": 1470
},
{
"epoch": 0.7272277839574837,
"grad_norm": 0.1157677978541888,
"learning_rate": 1.844586547125441e-05,
"loss": 0.4806,
"step": 1471
},
{
"epoch": 0.7277221604251638,
"grad_norm": 0.1128522478510303,
"learning_rate": 1.844377996664416e-05,
"loss": 0.5006,
"step": 1472
},
{
"epoch": 0.7282165368928439,
"grad_norm": 0.1219648997312354,
"learning_rate": 1.8441693181754183e-05,
"loss": 0.491,
"step": 1473
},
{
"epoch": 0.7287109133605241,
"grad_norm": 0.12121570891386065,
"learning_rate": 1.8439605116900886e-05,
"loss": 0.5227,
"step": 1474
},
{
"epoch": 0.7292052898282042,
"grad_norm": 0.12910053040209435,
"learning_rate": 1.8437515772400866e-05,
"loss": 0.502,
"step": 1475
},
{
"epoch": 0.7296996662958843,
"grad_norm": 0.12212713268617238,
"learning_rate": 1.8435425148570925e-05,
"loss": 0.5402,
"step": 1476
},
{
"epoch": 0.7301940427635645,
"grad_norm": 0.1132314438053867,
"learning_rate": 1.8433333245728048e-05,
"loss": 0.5058,
"step": 1477
},
{
"epoch": 0.7306884192312446,
"grad_norm": 0.12018324507835344,
"learning_rate": 1.8431240064189417e-05,
"loss": 0.5149,
"step": 1478
},
{
"epoch": 0.7311827956989247,
"grad_norm": 0.11620474876160342,
"learning_rate": 1.8429145604272413e-05,
"loss": 0.4882,
"step": 1479
},
{
"epoch": 0.7316771721666049,
"grad_norm": 0.1193306073980845,
"learning_rate": 1.8427049866294594e-05,
"loss": 0.5169,
"step": 1480
},
{
"epoch": 0.732171548634285,
"grad_norm": 0.11634164837685301,
"learning_rate": 1.8424952850573744e-05,
"loss": 0.5114,
"step": 1481
},
{
"epoch": 0.7326659251019652,
"grad_norm": 0.1140100073105078,
"learning_rate": 1.8422854557427802e-05,
"loss": 0.4835,
"step": 1482
},
{
"epoch": 0.7331603015696453,
"grad_norm": 0.11621017467722944,
"learning_rate": 1.842075498717493e-05,
"loss": 0.5114,
"step": 1483
},
{
"epoch": 0.7336546780373254,
"grad_norm": 0.10907501125526764,
"learning_rate": 1.841865414013347e-05,
"loss": 0.4827,
"step": 1484
},
{
"epoch": 0.7341490545050056,
"grad_norm": 0.1130513789420502,
"learning_rate": 1.8416552016621966e-05,
"loss": 0.4909,
"step": 1485
},
{
"epoch": 0.7346434309726857,
"grad_norm": 0.10563808060016196,
"learning_rate": 1.8414448616959143e-05,
"loss": 0.4824,
"step": 1486
},
{
"epoch": 0.7351378074403658,
"grad_norm": 0.11104033373735216,
"learning_rate": 1.841234394146393e-05,
"loss": 0.5239,
"step": 1487
},
{
"epoch": 0.735632183908046,
"grad_norm": 0.11076892733906427,
"learning_rate": 1.8410237990455446e-05,
"loss": 0.5166,
"step": 1488
},
{
"epoch": 0.7361265603757261,
"grad_norm": 0.11806185460058058,
"learning_rate": 1.8408130764253003e-05,
"loss": 0.5329,
"step": 1489
},
{
"epoch": 0.7366209368434062,
"grad_norm": 4.328881388196722,
"learning_rate": 1.8406022263176108e-05,
"loss": 0.5932,
"step": 1490
},
{
"epoch": 0.7371153133110864,
"grad_norm": 0.11985647661418612,
"learning_rate": 1.8403912487544464e-05,
"loss": 0.5215,
"step": 1491
},
{
"epoch": 0.7376096897787665,
"grad_norm": 0.13214154692562366,
"learning_rate": 1.8401801437677956e-05,
"loss": 0.5191,
"step": 1492
},
{
"epoch": 0.7381040662464466,
"grad_norm": 0.11564292363782895,
"learning_rate": 1.8399689113896674e-05,
"loss": 0.5094,
"step": 1493
},
{
"epoch": 0.7385984427141268,
"grad_norm": 0.13030049877729358,
"learning_rate": 1.83975755165209e-05,
"loss": 0.5008,
"step": 1494
},
{
"epoch": 0.7390928191818069,
"grad_norm": 0.12716262267632064,
"learning_rate": 1.83954606458711e-05,
"loss": 0.5243,
"step": 1495
},
{
"epoch": 0.739587195649487,
"grad_norm": 0.14531982382052336,
"learning_rate": 1.8393344502267945e-05,
"loss": 0.5235,
"step": 1496
},
{
"epoch": 0.7400815721171672,
"grad_norm": 0.13069211053147192,
"learning_rate": 1.8391227086032288e-05,
"loss": 0.504,
"step": 1497
},
{
"epoch": 0.7405759485848473,
"grad_norm": 0.12545699518240305,
"learning_rate": 1.838910839748518e-05,
"loss": 0.4926,
"step": 1498
},
{
"epoch": 0.7410703250525275,
"grad_norm": 0.11883409431833303,
"learning_rate": 1.8386988436947874e-05,
"loss": 0.5186,
"step": 1499
},
{
"epoch": 0.7415647015202076,
"grad_norm": 0.11311392323010892,
"learning_rate": 1.83848672047418e-05,
"loss": 0.5185,
"step": 1500
},
{
"epoch": 0.7420590779878877,
"grad_norm": 0.1144931327070294,
"learning_rate": 1.8382744701188585e-05,
"loss": 0.5608,
"step": 1501
},
{
"epoch": 0.7425534544555679,
"grad_norm": 0.10955903410595784,
"learning_rate": 1.8380620926610052e-05,
"loss": 0.5212,
"step": 1502
},
{
"epoch": 0.743047830923248,
"grad_norm": 0.1253030357217558,
"learning_rate": 1.8378495881328224e-05,
"loss": 0.5509,
"step": 1503
},
{
"epoch": 0.7435422073909282,
"grad_norm": 0.11394914759300458,
"learning_rate": 1.83763695656653e-05,
"loss": 0.4999,
"step": 1504
},
{
"epoch": 0.7440365838586084,
"grad_norm": 0.200399927857364,
"learning_rate": 1.8374241979943685e-05,
"loss": 0.5009,
"step": 1505
},
{
"epoch": 0.7445309603262885,
"grad_norm": 0.1170838331795174,
"learning_rate": 1.8372113124485975e-05,
"loss": 0.5227,
"step": 1506
},
{
"epoch": 0.7450253367939687,
"grad_norm": 0.12185269256763452,
"learning_rate": 1.8369982999614944e-05,
"loss": 0.501,
"step": 1507
},
{
"epoch": 0.7455197132616488,
"grad_norm": 0.11136744067094093,
"learning_rate": 1.8367851605653585e-05,
"loss": 0.4751,
"step": 1508
},
{
"epoch": 0.7460140897293289,
"grad_norm": 0.11240164591535622,
"learning_rate": 1.8365718942925058e-05,
"loss": 0.4954,
"step": 1509
},
{
"epoch": 0.7465084661970091,
"grad_norm": 0.11608388766780468,
"learning_rate": 1.836358501175273e-05,
"loss": 0.496,
"step": 1510
},
{
"epoch": 0.7470028426646892,
"grad_norm": 0.1253675929432797,
"learning_rate": 1.8361449812460157e-05,
"loss": 0.517,
"step": 1511
},
{
"epoch": 0.7474972191323693,
"grad_norm": 0.11521216302990234,
"learning_rate": 1.8359313345371082e-05,
"loss": 0.5036,
"step": 1512
},
{
"epoch": 0.7479915956000495,
"grad_norm": 0.11774008929701936,
"learning_rate": 1.8357175610809447e-05,
"loss": 0.5075,
"step": 1513
},
{
"epoch": 0.7484859720677296,
"grad_norm": 0.10946647663480401,
"learning_rate": 1.8355036609099388e-05,
"loss": 0.5258,
"step": 1514
},
{
"epoch": 0.7489803485354097,
"grad_norm": 0.11285667461517207,
"learning_rate": 1.8352896340565223e-05,
"loss": 0.5249,
"step": 1515
},
{
"epoch": 0.7494747250030899,
"grad_norm": 0.10863153540199894,
"learning_rate": 1.8350754805531468e-05,
"loss": 0.4849,
"step": 1516
},
{
"epoch": 0.74996910147077,
"grad_norm": 0.11089565237832022,
"learning_rate": 1.834861200432284e-05,
"loss": 0.4883,
"step": 1517
},
{
"epoch": 0.7504634779384501,
"grad_norm": 0.11252669128170059,
"learning_rate": 1.834646793726423e-05,
"loss": 0.4912,
"step": 1518
},
{
"epoch": 0.7504634779384501,
"eval_loss": 0.5169246196746826,
"eval_runtime": 100.9903,
"eval_samples_per_second": 300.563,
"eval_steps_per_second": 37.578,
"step": 1518
},
{
"epoch": 0.7509578544061303,
"grad_norm": 0.10977339930323489,
"learning_rate": 1.8344322604680734e-05,
"loss": 0.4756,
"step": 1519
},
{
"epoch": 0.7514522308738104,
"grad_norm": 0.12105259966818276,
"learning_rate": 1.8342176006897633e-05,
"loss": 0.5085,
"step": 1520
},
{
"epoch": 0.7519466073414905,
"grad_norm": 0.11249697552074356,
"learning_rate": 1.8340028144240404e-05,
"loss": 0.4901,
"step": 1521
},
{
"epoch": 0.7524409838091707,
"grad_norm": 0.11920286213753867,
"learning_rate": 1.8337879017034715e-05,
"loss": 0.5291,
"step": 1522
},
{
"epoch": 0.7529353602768508,
"grad_norm": 0.1162768052571107,
"learning_rate": 1.8335728625606427e-05,
"loss": 0.4855,
"step": 1523
},
{
"epoch": 0.753429736744531,
"grad_norm": 0.1169234600590092,
"learning_rate": 1.833357697028159e-05,
"loss": 0.5278,
"step": 1524
},
{
"epoch": 0.7539241132122111,
"grad_norm": 0.11711873182865562,
"learning_rate": 1.833142405138644e-05,
"loss": 0.5256,
"step": 1525
},
{
"epoch": 0.7544184896798912,
"grad_norm": 0.11927307098482616,
"learning_rate": 1.8329269869247422e-05,
"loss": 0.5167,
"step": 1526
},
{
"epoch": 0.7549128661475714,
"grad_norm": 0.1125879361684095,
"learning_rate": 1.8327114424191153e-05,
"loss": 0.4903,
"step": 1527
},
{
"epoch": 0.7554072426152515,
"grad_norm": 0.12278630775971254,
"learning_rate": 1.832495771654446e-05,
"loss": 0.4837,
"step": 1528
},
{
"epoch": 0.7559016190829316,
"grad_norm": 0.1067242050564068,
"learning_rate": 1.832279974663434e-05,
"loss": 0.4957,
"step": 1529
},
{
"epoch": 0.7563959955506118,
"grad_norm": 0.12157265674094228,
"learning_rate": 1.8320640514788002e-05,
"loss": 0.5101,
"step": 1530
},
{
"epoch": 0.7568903720182919,
"grad_norm": 0.1087486400697486,
"learning_rate": 1.8318480021332833e-05,
"loss": 0.4955,
"step": 1531
},
{
"epoch": 0.757384748485972,
"grad_norm": 0.12104662846652355,
"learning_rate": 1.8316318266596416e-05,
"loss": 0.4991,
"step": 1532
},
{
"epoch": 0.7578791249536522,
"grad_norm": 0.11251418599147349,
"learning_rate": 1.8314155250906526e-05,
"loss": 0.505,
"step": 1533
},
{
"epoch": 0.7583735014213323,
"grad_norm": 0.1305465123465392,
"learning_rate": 1.8311990974591128e-05,
"loss": 0.4938,
"step": 1534
},
{
"epoch": 0.7588678778890124,
"grad_norm": 0.11895062913758331,
"learning_rate": 1.8309825437978376e-05,
"loss": 0.5474,
"step": 1535
},
{
"epoch": 0.7593622543566926,
"grad_norm": 0.12602419225355896,
"learning_rate": 1.830765864139662e-05,
"loss": 0.488,
"step": 1536
},
{
"epoch": 0.7598566308243727,
"grad_norm": 0.12219936777565027,
"learning_rate": 1.8305490585174398e-05,
"loss": 0.4966,
"step": 1537
},
{
"epoch": 0.7603510072920528,
"grad_norm": 0.10949234012012958,
"learning_rate": 1.8303321269640442e-05,
"loss": 0.5129,
"step": 1538
},
{
"epoch": 0.760845383759733,
"grad_norm": 0.11696878811507838,
"learning_rate": 1.8301150695123663e-05,
"loss": 0.5323,
"step": 1539
},
{
"epoch": 0.7613397602274131,
"grad_norm": 0.10562743296506082,
"learning_rate": 1.8298978861953184e-05,
"loss": 0.5015,
"step": 1540
},
{
"epoch": 0.7618341366950934,
"grad_norm": 0.11899024706787929,
"learning_rate": 1.82968057704583e-05,
"loss": 0.5739,
"step": 1541
},
{
"epoch": 0.7623285131627735,
"grad_norm": 0.10381696946180312,
"learning_rate": 1.8294631420968504e-05,
"loss": 0.4734,
"step": 1542
},
{
"epoch": 0.7628228896304536,
"grad_norm": 0.10749844630331756,
"learning_rate": 1.8292455813813482e-05,
"loss": 0.5019,
"step": 1543
},
{
"epoch": 0.7633172660981338,
"grad_norm": 0.1073814224301153,
"learning_rate": 1.829027894932311e-05,
"loss": 0.4998,
"step": 1544
},
{
"epoch": 0.7638116425658139,
"grad_norm": 0.11042840721869714,
"learning_rate": 1.8288100827827446e-05,
"loss": 0.4956,
"step": 1545
},
{
"epoch": 0.764306019033494,
"grad_norm": 0.11153046294980228,
"learning_rate": 1.8285921449656752e-05,
"loss": 0.5168,
"step": 1546
},
{
"epoch": 0.7648003955011742,
"grad_norm": 0.1091605453107047,
"learning_rate": 1.8283740815141468e-05,
"loss": 0.511,
"step": 1547
},
{
"epoch": 0.7652947719688543,
"grad_norm": 0.11182914450730866,
"learning_rate": 1.8281558924612237e-05,
"loss": 0.5118,
"step": 1548
},
{
"epoch": 0.7657891484365345,
"grad_norm": 0.1174459867125228,
"learning_rate": 1.8279375778399885e-05,
"loss": 0.5164,
"step": 1549
},
{
"epoch": 0.7662835249042146,
"grad_norm": 0.10843915289483688,
"learning_rate": 1.827719137683542e-05,
"loss": 0.4702,
"step": 1550
},
{
"epoch": 0.7667779013718947,
"grad_norm": 0.11189934739001768,
"learning_rate": 1.8275005720250066e-05,
"loss": 0.5039,
"step": 1551
},
{
"epoch": 0.7672722778395749,
"grad_norm": 0.11232575340212998,
"learning_rate": 1.827281880897521e-05,
"loss": 0.4732,
"step": 1552
},
{
"epoch": 0.767766654307255,
"grad_norm": 0.1088444534042866,
"learning_rate": 1.8270630643342438e-05,
"loss": 0.5227,
"step": 1553
},
{
"epoch": 0.7682610307749351,
"grad_norm": 0.1391837923345606,
"learning_rate": 1.8268441223683537e-05,
"loss": 0.4914,
"step": 1554
},
{
"epoch": 0.7687554072426153,
"grad_norm": 0.10761249112366496,
"learning_rate": 1.826625055033047e-05,
"loss": 0.4991,
"step": 1555
},
{
"epoch": 0.7692497837102954,
"grad_norm": 0.10296760375740688,
"learning_rate": 1.82640586236154e-05,
"loss": 0.5153,
"step": 1556
},
{
"epoch": 0.7697441601779755,
"grad_norm": 0.12230219477505408,
"learning_rate": 1.8261865443870668e-05,
"loss": 0.5154,
"step": 1557
},
{
"epoch": 0.7702385366456557,
"grad_norm": 0.11013956429341706,
"learning_rate": 1.8259671011428824e-05,
"loss": 0.4745,
"step": 1558
},
{
"epoch": 0.7707329131133358,
"grad_norm": 0.11507658304527071,
"learning_rate": 1.8257475326622587e-05,
"loss": 0.5031,
"step": 1559
},
{
"epoch": 0.7712272895810159,
"grad_norm": 0.11936241448010394,
"learning_rate": 1.825527838978488e-05,
"loss": 0.538,
"step": 1560
},
{
"epoch": 0.7717216660486961,
"grad_norm": 0.10831126046983862,
"learning_rate": 1.8253080201248806e-05,
"loss": 0.491,
"step": 1561
},
{
"epoch": 0.7722160425163762,
"grad_norm": 0.11579001798229159,
"learning_rate": 1.825088076134767e-05,
"loss": 0.4784,
"step": 1562
},
{
"epoch": 0.7727104189840563,
"grad_norm": 0.21612510669252785,
"learning_rate": 1.8248680070414956e-05,
"loss": 0.4974,
"step": 1563
},
{
"epoch": 0.7732047954517365,
"grad_norm": 0.11475656910837606,
"learning_rate": 1.8246478128784345e-05,
"loss": 0.4765,
"step": 1564
},
{
"epoch": 0.7736991719194166,
"grad_norm": 0.11867465971816395,
"learning_rate": 1.8244274936789698e-05,
"loss": 0.537,
"step": 1565
},
{
"epoch": 0.7741935483870968,
"grad_norm": 0.11185054579371267,
"learning_rate": 1.8242070494765078e-05,
"loss": 0.5031,
"step": 1566
},
{
"epoch": 0.7746879248547769,
"grad_norm": 0.11110420993053961,
"learning_rate": 1.823986480304473e-05,
"loss": 0.4979,
"step": 1567
},
{
"epoch": 0.775182301322457,
"grad_norm": 0.1255003467183431,
"learning_rate": 1.823765786196309e-05,
"loss": 0.4863,
"step": 1568
},
{
"epoch": 0.7756766777901372,
"grad_norm": 0.10894246657797643,
"learning_rate": 1.8235449671854776e-05,
"loss": 0.5083,
"step": 1569
},
{
"epoch": 0.7761710542578173,
"grad_norm": 0.1767459656950356,
"learning_rate": 1.8233240233054613e-05,
"loss": 0.5274,
"step": 1570
},
{
"epoch": 0.7766654307254974,
"grad_norm": 0.11446514296545794,
"learning_rate": 1.82310295458976e-05,
"loss": 0.521,
"step": 1571
},
{
"epoch": 0.7771598071931776,
"grad_norm": 0.8484576430954529,
"learning_rate": 1.8228817610718934e-05,
"loss": 0.5395,
"step": 1572
},
{
"epoch": 0.7776541836608577,
"grad_norm": 0.11165687551538205,
"learning_rate": 1.822660442785399e-05,
"loss": 0.492,
"step": 1573
},
{
"epoch": 0.7781485601285378,
"grad_norm": 0.12232561736617044,
"learning_rate": 1.8224389997638344e-05,
"loss": 0.5133,
"step": 1574
},
{
"epoch": 0.778642936596218,
"grad_norm": 0.12779321330463345,
"learning_rate": 1.8222174320407758e-05,
"loss": 0.4961,
"step": 1575
},
{
"epoch": 0.7791373130638981,
"grad_norm": 0.1261261131084318,
"learning_rate": 1.8219957396498183e-05,
"loss": 0.4975,
"step": 1576
},
{
"epoch": 0.7796316895315782,
"grad_norm": 0.12186108170006381,
"learning_rate": 1.8217739226245753e-05,
"loss": 0.4722,
"step": 1577
},
{
"epoch": 0.7801260659992585,
"grad_norm": 0.2136167795560751,
"learning_rate": 1.82155198099868e-05,
"loss": 0.5161,
"step": 1578
},
{
"epoch": 0.7806204424669386,
"grad_norm": 0.12561681551176695,
"learning_rate": 1.8213299148057837e-05,
"loss": 0.5044,
"step": 1579
},
{
"epoch": 0.7811148189346188,
"grad_norm": 0.11723635593751418,
"learning_rate": 1.8211077240795573e-05,
"loss": 0.5227,
"step": 1580
},
{
"epoch": 0.7816091954022989,
"grad_norm": 0.13294340687488976,
"learning_rate": 1.8208854088536903e-05,
"loss": 0.5176,
"step": 1581
},
{
"epoch": 0.782103571869979,
"grad_norm": 0.29165369920938566,
"learning_rate": 1.8206629691618904e-05,
"loss": 0.5437,
"step": 1582
},
{
"epoch": 0.7825979483376592,
"grad_norm": 0.4180917713489711,
"learning_rate": 1.8204404050378856e-05,
"loss": 0.5058,
"step": 1583
},
{
"epoch": 0.7830923248053393,
"grad_norm": 0.12965591767896903,
"learning_rate": 1.8202177165154217e-05,
"loss": 0.5174,
"step": 1584
},
{
"epoch": 0.7835867012730194,
"grad_norm": 0.13687546704344175,
"learning_rate": 1.819994903628263e-05,
"loss": 0.5044,
"step": 1585
},
{
"epoch": 0.7840810777406996,
"grad_norm": 0.11701702769601974,
"learning_rate": 1.8197719664101944e-05,
"loss": 0.4961,
"step": 1586
},
{
"epoch": 0.7845754542083797,
"grad_norm": 0.139395043509737,
"learning_rate": 1.8195489048950175e-05,
"loss": 0.4852,
"step": 1587
},
{
"epoch": 0.7850698306760598,
"grad_norm": 0.13041580613294362,
"learning_rate": 1.8193257191165544e-05,
"loss": 0.53,
"step": 1588
},
{
"epoch": 0.78556420714374,
"grad_norm": 0.1299442278582246,
"learning_rate": 1.8191024091086455e-05,
"loss": 0.5189,
"step": 1589
},
{
"epoch": 0.7860585836114201,
"grad_norm": 0.1263828921415048,
"learning_rate": 1.8188789749051494e-05,
"loss": 0.5242,
"step": 1590
},
{
"epoch": 0.7865529600791002,
"grad_norm": 0.12010959640609364,
"learning_rate": 1.8186554165399446e-05,
"loss": 0.5055,
"step": 1591
},
{
"epoch": 0.7870473365467804,
"grad_norm": 0.13113373607564094,
"learning_rate": 1.818431734046928e-05,
"loss": 0.5204,
"step": 1592
},
{
"epoch": 0.7875417130144605,
"grad_norm": 0.11923396792903965,
"learning_rate": 1.8182079274600146e-05,
"loss": 0.5085,
"step": 1593
},
{
"epoch": 0.7880360894821407,
"grad_norm": 0.11448592322756816,
"learning_rate": 1.817983996813139e-05,
"loss": 0.4716,
"step": 1594
},
{
"epoch": 0.7885304659498208,
"grad_norm": 0.12065532410174383,
"learning_rate": 1.817759942140255e-05,
"loss": 0.4966,
"step": 1595
},
{
"epoch": 0.7890248424175009,
"grad_norm": 0.12073311919360415,
"learning_rate": 1.8175357634753343e-05,
"loss": 0.5145,
"step": 1596
},
{
"epoch": 0.7895192188851811,
"grad_norm": 0.11734458849611246,
"learning_rate": 1.8173114608523674e-05,
"loss": 0.5352,
"step": 1597
},
{
"epoch": 0.7900135953528612,
"grad_norm": 0.12005602834889328,
"learning_rate": 1.8170870343053646e-05,
"loss": 0.5001,
"step": 1598
},
{
"epoch": 0.7905079718205413,
"grad_norm": 0.1155097768413269,
"learning_rate": 1.8168624838683543e-05,
"loss": 0.511,
"step": 1599
},
{
"epoch": 0.7910023482882215,
"grad_norm": 0.11579172541938927,
"learning_rate": 1.8166378095753835e-05,
"loss": 0.4939,
"step": 1600
},
{
"epoch": 0.7914967247559016,
"grad_norm": 0.15611939176854864,
"learning_rate": 1.8164130114605177e-05,
"loss": 0.5032,
"step": 1601
},
{
"epoch": 0.7919911012235817,
"grad_norm": 0.6807021862516772,
"learning_rate": 1.816188089557843e-05,
"loss": 0.5365,
"step": 1602
},
{
"epoch": 0.7924854776912619,
"grad_norm": 0.12014697969627158,
"learning_rate": 1.815963043901462e-05,
"loss": 0.5124,
"step": 1603
},
{
"epoch": 0.792979854158942,
"grad_norm": 0.11141578045201661,
"learning_rate": 1.815737874525497e-05,
"loss": 0.476,
"step": 1604
},
{
"epoch": 0.7934742306266221,
"grad_norm": 0.12453322867202286,
"learning_rate": 1.8155125814640896e-05,
"loss": 0.5076,
"step": 1605
},
{
"epoch": 0.7939686070943023,
"grad_norm": 0.11610035068406502,
"learning_rate": 1.815287164751399e-05,
"loss": 0.5308,
"step": 1606
},
{
"epoch": 0.7944629835619824,
"grad_norm": 0.11363485857998774,
"learning_rate": 1.8150616244216047e-05,
"loss": 0.4838,
"step": 1607
},
{
"epoch": 0.7949573600296626,
"grad_norm": 0.11747281553091403,
"learning_rate": 1.814835960508903e-05,
"loss": 0.5404,
"step": 1608
},
{
"epoch": 0.7954517364973427,
"grad_norm": 0.10888696757348207,
"learning_rate": 1.8146101730475107e-05,
"loss": 0.4638,
"step": 1609
},
{
"epoch": 0.7959461129650228,
"grad_norm": 0.12172120544941309,
"learning_rate": 1.814384262071662e-05,
"loss": 0.51,
"step": 1610
},
{
"epoch": 0.796440489432703,
"grad_norm": 0.11572627732594978,
"learning_rate": 1.814158227615611e-05,
"loss": 0.5032,
"step": 1611
},
{
"epoch": 0.7969348659003831,
"grad_norm": 0.11263085790287053,
"learning_rate": 1.8139320697136297e-05,
"loss": 0.5047,
"step": 1612
},
{
"epoch": 0.7974292423680632,
"grad_norm": 0.10678854975252682,
"learning_rate": 1.813705788400009e-05,
"loss": 0.516,
"step": 1613
},
{
"epoch": 0.7979236188357434,
"grad_norm": 0.13017009258632473,
"learning_rate": 1.8134793837090585e-05,
"loss": 0.503,
"step": 1614
},
{
"epoch": 0.7984179953034236,
"grad_norm": 0.25445789944996455,
"learning_rate": 1.8132528556751073e-05,
"loss": 0.5005,
"step": 1615
},
{
"epoch": 0.7989123717711037,
"grad_norm": 0.11176258859879269,
"learning_rate": 1.8130262043325015e-05,
"loss": 0.5068,
"step": 1616
},
{
"epoch": 0.7994067482387839,
"grad_norm": 0.23349638290362795,
"learning_rate": 1.812799429715607e-05,
"loss": 0.5215,
"step": 1617
},
{
"epoch": 0.799901124706464,
"grad_norm": 0.10632257286702794,
"learning_rate": 1.812572531858809e-05,
"loss": 0.4919,
"step": 1618
},
{
"epoch": 0.8003955011741442,
"grad_norm": 0.12241453072927296,
"learning_rate": 1.8123455107965104e-05,
"loss": 0.5073,
"step": 1619
},
{
"epoch": 0.8008898776418243,
"grad_norm": 0.11755659437235795,
"learning_rate": 1.8121183665631326e-05,
"loss": 0.5138,
"step": 1620
},
{
"epoch": 0.8013842541095044,
"grad_norm": 0.12404074611007797,
"learning_rate": 1.811891099193116e-05,
"loss": 0.5126,
"step": 1621
},
{
"epoch": 0.8018786305771846,
"grad_norm": 0.11563881671776396,
"learning_rate": 1.811663708720921e-05,
"loss": 0.5144,
"step": 1622
},
{
"epoch": 0.8023730070448647,
"grad_norm": 0.11928731084585482,
"learning_rate": 1.8114361951810246e-05,
"loss": 0.4697,
"step": 1623
},
{
"epoch": 0.8028673835125448,
"grad_norm": 0.1016290359535352,
"learning_rate": 1.8112085586079228e-05,
"loss": 0.4808,
"step": 1624
},
{
"epoch": 0.803361759980225,
"grad_norm": 0.12641078356832078,
"learning_rate": 1.810980799036132e-05,
"loss": 0.5137,
"step": 1625
},
{
"epoch": 0.8038561364479051,
"grad_norm": 0.1124906801997757,
"learning_rate": 1.8107529165001847e-05,
"loss": 0.536,
"step": 1626
},
{
"epoch": 0.8043505129155852,
"grad_norm": 0.11163251805146057,
"learning_rate": 1.8105249110346345e-05,
"loss": 0.4828,
"step": 1627
},
{
"epoch": 0.8048448893832654,
"grad_norm": 0.11036602984913058,
"learning_rate": 1.8102967826740517e-05,
"loss": 0.5023,
"step": 1628
},
{
"epoch": 0.8053392658509455,
"grad_norm": 0.10871014658894955,
"learning_rate": 1.8100685314530266e-05,
"loss": 0.4878,
"step": 1629
},
{
"epoch": 0.8058336423186256,
"grad_norm": 0.11237077611770631,
"learning_rate": 1.8098401574061668e-05,
"loss": 0.4972,
"step": 1630
},
{
"epoch": 0.8063280187863058,
"grad_norm": 0.10789553188312324,
"learning_rate": 1.8096116605681004e-05,
"loss": 0.4916,
"step": 1631
},
{
"epoch": 0.8068223952539859,
"grad_norm": 0.11081908141288639,
"learning_rate": 1.8093830409734717e-05,
"loss": 0.5265,
"step": 1632
},
{
"epoch": 0.807316771721666,
"grad_norm": 0.13845744873556065,
"learning_rate": 1.8091542986569465e-05,
"loss": 0.5553,
"step": 1633
},
{
"epoch": 0.8078111481893462,
"grad_norm": 0.1089770491590344,
"learning_rate": 1.8089254336532062e-05,
"loss": 0.4873,
"step": 1634
},
{
"epoch": 0.8083055246570263,
"grad_norm": 0.09824226802177065,
"learning_rate": 1.808696445996953e-05,
"loss": 0.4676,
"step": 1635
},
{
"epoch": 0.8087999011247065,
"grad_norm": 0.11096137164110392,
"learning_rate": 1.8084673357229067e-05,
"loss": 0.5267,
"step": 1636
},
{
"epoch": 0.8092942775923866,
"grad_norm": 0.10187502538358767,
"learning_rate": 1.8082381028658055e-05,
"loss": 0.5011,
"step": 1637
},
{
"epoch": 0.8097886540600667,
"grad_norm": 0.2269725989502022,
"learning_rate": 1.8080087474604074e-05,
"loss": 0.5156,
"step": 1638
},
{
"epoch": 0.8102830305277469,
"grad_norm": 0.10920630636848125,
"learning_rate": 1.807779269541488e-05,
"loss": 0.5453,
"step": 1639
},
{
"epoch": 0.810777406995427,
"grad_norm": 0.10798793844515413,
"learning_rate": 1.807549669143841e-05,
"loss": 0.5575,
"step": 1640
},
{
"epoch": 0.8112717834631071,
"grad_norm": 0.11084666151215966,
"learning_rate": 1.8073199463022804e-05,
"loss": 0.5045,
"step": 1641
},
{
"epoch": 0.8117661599307873,
"grad_norm": 0.11111501967383265,
"learning_rate": 1.8070901010516368e-05,
"loss": 0.544,
"step": 1642
},
{
"epoch": 0.8122605363984674,
"grad_norm": 0.10290105301590033,
"learning_rate": 1.8068601334267605e-05,
"loss": 0.4899,
"step": 1643
},
{
"epoch": 0.8127549128661475,
"grad_norm": 0.11182442659007466,
"learning_rate": 1.8066300434625202e-05,
"loss": 0.5384,
"step": 1644
},
{
"epoch": 0.8132492893338277,
"grad_norm": 0.10505957324415846,
"learning_rate": 1.8063998311938026e-05,
"loss": 0.4871,
"step": 1645
},
{
"epoch": 0.8137436658015078,
"grad_norm": 0.10399475412222146,
"learning_rate": 1.8061694966555145e-05,
"loss": 0.4939,
"step": 1646
},
{
"epoch": 0.814238042269188,
"grad_norm": 0.10364796101954571,
"learning_rate": 1.805939039882579e-05,
"loss": 0.4792,
"step": 1647
},
{
"epoch": 0.8147324187368681,
"grad_norm": 0.1112350804274048,
"learning_rate": 1.8057084609099397e-05,
"loss": 0.4883,
"step": 1648
},
{
"epoch": 0.8152267952045482,
"grad_norm": 0.10464217051053609,
"learning_rate": 1.8054777597725573e-05,
"loss": 0.481,
"step": 1649
},
{
"epoch": 0.8157211716722284,
"grad_norm": 0.11135874579663645,
"learning_rate": 1.8052469365054123e-05,
"loss": 0.4965,
"step": 1650
},
{
"epoch": 0.8162155481399085,
"grad_norm": 0.110735825064967,
"learning_rate": 1.8050159911435024e-05,
"loss": 0.5074,
"step": 1651
},
{
"epoch": 0.8167099246075887,
"grad_norm": 0.10898741376553198,
"learning_rate": 1.8047849237218446e-05,
"loss": 0.5047,
"step": 1652
},
{
"epoch": 0.8172043010752689,
"grad_norm": 0.11594538304911019,
"learning_rate": 1.8045537342754745e-05,
"loss": 0.5249,
"step": 1653
},
{
"epoch": 0.817698677542949,
"grad_norm": 0.11186888764774354,
"learning_rate": 1.8043224228394458e-05,
"loss": 0.4768,
"step": 1654
},
{
"epoch": 0.8181930540106291,
"grad_norm": 0.11345591021671314,
"learning_rate": 1.804090989448831e-05,
"loss": 0.5102,
"step": 1655
},
{
"epoch": 0.8186874304783093,
"grad_norm": 0.1077763010738406,
"learning_rate": 1.8038594341387208e-05,
"loss": 0.4924,
"step": 1656
},
{
"epoch": 0.8191818069459894,
"grad_norm": 0.11155496465111282,
"learning_rate": 1.8036277569442245e-05,
"loss": 0.4826,
"step": 1657
},
{
"epoch": 0.8196761834136695,
"grad_norm": 0.10916969122709735,
"learning_rate": 1.8033959579004704e-05,
"loss": 0.4943,
"step": 1658
},
{
"epoch": 0.8201705598813497,
"grad_norm": 0.11103214579271907,
"learning_rate": 1.803164037042604e-05,
"loss": 0.5189,
"step": 1659
},
{
"epoch": 0.8206649363490298,
"grad_norm": 0.11049821252153219,
"learning_rate": 1.8029319944057907e-05,
"loss": 0.4957,
"step": 1660
},
{
"epoch": 0.82115931281671,
"grad_norm": 0.11037765533860981,
"learning_rate": 1.8026998300252133e-05,
"loss": 0.5324,
"step": 1661
},
{
"epoch": 0.8216536892843901,
"grad_norm": 0.10720540588647114,
"learning_rate": 1.802467543936074e-05,
"loss": 0.5134,
"step": 1662
},
{
"epoch": 0.8221480657520702,
"grad_norm": 0.11126956876902662,
"learning_rate": 1.8022351361735925e-05,
"loss": 0.4656,
"step": 1663
},
{
"epoch": 0.8226424422197504,
"grad_norm": 0.14232436838040305,
"learning_rate": 1.8020026067730077e-05,
"loss": 0.5149,
"step": 1664
},
{
"epoch": 0.8231368186874305,
"grad_norm": 0.09963050658154443,
"learning_rate": 1.8017699557695765e-05,
"loss": 0.4894,
"step": 1665
},
{
"epoch": 0.8236311951551106,
"grad_norm": 0.1059120160894076,
"learning_rate": 1.8015371831985743e-05,
"loss": 0.4765,
"step": 1666
},
{
"epoch": 0.8241255716227908,
"grad_norm": 0.10567044573591329,
"learning_rate": 1.801304289095295e-05,
"loss": 0.4994,
"step": 1667
},
{
"epoch": 0.8246199480904709,
"grad_norm": 0.11602031793360452,
"learning_rate": 1.8010712734950515e-05,
"loss": 0.4953,
"step": 1668
},
{
"epoch": 0.825114324558151,
"grad_norm": 0.10480780919866008,
"learning_rate": 1.8008381364331737e-05,
"loss": 0.5068,
"step": 1669
},
{
"epoch": 0.8256087010258312,
"grad_norm": 0.10359031906661526,
"learning_rate": 1.8006048779450114e-05,
"loss": 0.4947,
"step": 1670
},
{
"epoch": 0.8261030774935113,
"grad_norm": 0.10274796421069078,
"learning_rate": 1.8003714980659313e-05,
"loss": 0.4861,
"step": 1671
},
{
"epoch": 0.8265974539611914,
"grad_norm": 0.1121326032584218,
"learning_rate": 1.8001379968313208e-05,
"loss": 0.5208,
"step": 1672
},
{
"epoch": 0.8270918304288716,
"grad_norm": 0.10390067297860398,
"learning_rate": 1.7999043742765833e-05,
"loss": 0.4872,
"step": 1673
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.1019286851925381,
"learning_rate": 1.799670630437142e-05,
"loss": 0.4896,
"step": 1674
},
{
"epoch": 0.8280805833642318,
"grad_norm": 0.11237938698968557,
"learning_rate": 1.7994367653484375e-05,
"loss": 0.5112,
"step": 1675
},
{
"epoch": 0.828574959831912,
"grad_norm": 0.10740353530123058,
"learning_rate": 1.79920277904593e-05,
"loss": 0.5202,
"step": 1676
},
{
"epoch": 0.8290693362995921,
"grad_norm": 0.10938456819512841,
"learning_rate": 1.7989686715650968e-05,
"loss": 0.5271,
"step": 1677
},
{
"epoch": 0.8295637127672723,
"grad_norm": 0.1184450788596645,
"learning_rate": 1.7987344429414354e-05,
"loss": 0.5097,
"step": 1678
},
{
"epoch": 0.8300580892349524,
"grad_norm": 7.206826998898181,
"learning_rate": 1.798500093210459e-05,
"loss": 0.8575,
"step": 1679
},
{
"epoch": 0.8305524657026325,
"grad_norm": 0.12421292092327496,
"learning_rate": 1.7982656224077016e-05,
"loss": 0.4894,
"step": 1680
},
{
"epoch": 0.8310468421703127,
"grad_norm": 0.7498230679805675,
"learning_rate": 1.7980310305687142e-05,
"loss": 0.6308,
"step": 1681
},
{
"epoch": 0.8315412186379928,
"grad_norm": 0.12645036256329178,
"learning_rate": 1.797796317729067e-05,
"loss": 0.5072,
"step": 1682
},
{
"epoch": 0.8320355951056729,
"grad_norm": 0.1709442938934923,
"learning_rate": 1.7975614839243476e-05,
"loss": 0.5617,
"step": 1683
},
{
"epoch": 0.8325299715733531,
"grad_norm": 0.18605873659663558,
"learning_rate": 1.7973265291901625e-05,
"loss": 0.4978,
"step": 1684
},
{
"epoch": 0.8330243480410332,
"grad_norm": 0.13993370862095234,
"learning_rate": 1.7970914535621368e-05,
"loss": 0.4742,
"step": 1685
},
{
"epoch": 0.8335187245087133,
"grad_norm": 0.13001500879401612,
"learning_rate": 1.7968562570759137e-05,
"loss": 0.5293,
"step": 1686
},
{
"epoch": 0.8340131009763935,
"grad_norm": 0.1499155863042301,
"learning_rate": 1.796620939767154e-05,
"loss": 0.481,
"step": 1687
},
{
"epoch": 0.8345074774440736,
"grad_norm": 0.3649643954294922,
"learning_rate": 1.7963855016715378e-05,
"loss": 0.505,
"step": 1688
},
{
"epoch": 0.8350018539117539,
"grad_norm": 0.11744946985852246,
"learning_rate": 1.7961499428247632e-05,
"loss": 0.4974,
"step": 1689
},
{
"epoch": 0.835496230379434,
"grad_norm": 0.12213907164416012,
"learning_rate": 1.7959142632625463e-05,
"loss": 0.5107,
"step": 1690
},
{
"epoch": 0.8359906068471141,
"grad_norm": 0.12038817758814949,
"learning_rate": 1.7956784630206225e-05,
"loss": 0.5268,
"step": 1691
},
{
"epoch": 0.8364849833147943,
"grad_norm": 0.11457939366971773,
"learning_rate": 1.795442542134744e-05,
"loss": 0.4826,
"step": 1692
},
{
"epoch": 0.8369793597824744,
"grad_norm": 0.11548661136704294,
"learning_rate": 1.7952065006406826e-05,
"loss": 0.5078,
"step": 1693
},
{
"epoch": 0.8374737362501545,
"grad_norm": 0.11332265500224674,
"learning_rate": 1.7949703385742277e-05,
"loss": 0.5068,
"step": 1694
},
{
"epoch": 0.8379681127178347,
"grad_norm": 0.11807160439892926,
"learning_rate": 1.7947340559711866e-05,
"loss": 0.4824,
"step": 1695
},
{
"epoch": 0.8384624891855148,
"grad_norm": 0.11648756144267396,
"learning_rate": 1.7944976528673862e-05,
"loss": 0.5014,
"step": 1696
},
{
"epoch": 0.8389568656531949,
"grad_norm": 0.10721073180178253,
"learning_rate": 1.7942611292986708e-05,
"loss": 0.4814,
"step": 1697
},
{
"epoch": 0.8394512421208751,
"grad_norm": 0.11745435266083615,
"learning_rate": 1.7940244853009024e-05,
"loss": 0.5082,
"step": 1698
},
{
"epoch": 0.8399456185885552,
"grad_norm": 0.11994345821127454,
"learning_rate": 1.7937877209099624e-05,
"loss": 0.4807,
"step": 1699
},
{
"epoch": 0.8404399950562353,
"grad_norm": 0.1149563601049724,
"learning_rate": 1.79355083616175e-05,
"loss": 0.5138,
"step": 1700
},
{
"epoch": 0.8409343715239155,
"grad_norm": 0.11369033507322293,
"learning_rate": 1.7933138310921827e-05,
"loss": 0.49,
"step": 1701
},
{
"epoch": 0.8414287479915956,
"grad_norm": 0.11131879152496378,
"learning_rate": 1.7930767057371955e-05,
"loss": 0.5007,
"step": 1702
},
{
"epoch": 0.8419231244592758,
"grad_norm": 0.10857162634518323,
"learning_rate": 1.792839460132743e-05,
"loss": 0.5032,
"step": 1703
},
{
"epoch": 0.8424175009269559,
"grad_norm": 0.1083606543452569,
"learning_rate": 1.7926020943147974e-05,
"loss": 0.4982,
"step": 1704
},
{
"epoch": 0.842911877394636,
"grad_norm": 0.10512330753116494,
"learning_rate": 1.7923646083193484e-05,
"loss": 0.4984,
"step": 1705
},
{
"epoch": 0.8434062538623162,
"grad_norm": 0.10634419563931162,
"learning_rate": 1.792127002182405e-05,
"loss": 0.4905,
"step": 1706
},
{
"epoch": 0.8439006303299963,
"grad_norm": 0.10665212734072965,
"learning_rate": 1.791889275939994e-05,
"loss": 0.4956,
"step": 1707
},
{
"epoch": 0.8443950067976764,
"grad_norm": 0.10740651535022396,
"learning_rate": 1.7916514296281603e-05,
"loss": 0.4868,
"step": 1708
},
{
"epoch": 0.8448893832653566,
"grad_norm": 0.11100290892723799,
"learning_rate": 1.7914134632829667e-05,
"loss": 0.5137,
"step": 1709
},
{
"epoch": 0.8453837597330367,
"grad_norm": 0.1085001972342328,
"learning_rate": 1.7911753769404954e-05,
"loss": 0.496,
"step": 1710
},
{
"epoch": 0.8458781362007168,
"grad_norm": 1.2243684700343822,
"learning_rate": 1.7909371706368458e-05,
"loss": 0.5139,
"step": 1711
},
{
"epoch": 0.846372512668397,
"grad_norm": 0.11312916794932967,
"learning_rate": 1.7906988444081353e-05,
"loss": 0.4793,
"step": 1712
},
{
"epoch": 0.8468668891360771,
"grad_norm": 0.1244699272881071,
"learning_rate": 1.7904603982905004e-05,
"loss": 0.4915,
"step": 1713
},
{
"epoch": 0.8473612656037572,
"grad_norm": 0.12323429171778284,
"learning_rate": 1.7902218323200948e-05,
"loss": 0.4969,
"step": 1714
},
{
"epoch": 0.8478556420714374,
"grad_norm": 0.12747324427986914,
"learning_rate": 1.789983146533091e-05,
"loss": 0.4964,
"step": 1715
},
{
"epoch": 0.8483500185391175,
"grad_norm": 0.11753104094590212,
"learning_rate": 1.7897443409656792e-05,
"loss": 0.492,
"step": 1716
},
{
"epoch": 0.8488443950067976,
"grad_norm": 0.12683755391030652,
"learning_rate": 1.789505415654069e-05,
"loss": 0.5053,
"step": 1717
},
{
"epoch": 0.8493387714744778,
"grad_norm": 0.3360468911586904,
"learning_rate": 1.789266370634486e-05,
"loss": 0.5162,
"step": 1718
},
{
"epoch": 0.8498331479421579,
"grad_norm": 0.7269546013866,
"learning_rate": 1.789027205943176e-05,
"loss": 0.4807,
"step": 1719
},
{
"epoch": 0.850327524409838,
"grad_norm": 0.1250921275366352,
"learning_rate": 1.7887879216164016e-05,
"loss": 0.4844,
"step": 1720
},
{
"epoch": 0.8508219008775182,
"grad_norm": 0.13161657494189286,
"learning_rate": 1.7885485176904446e-05,
"loss": 0.5219,
"step": 1721
},
{
"epoch": 0.8513162773451983,
"grad_norm": 0.1139767979735568,
"learning_rate": 1.7883089942016035e-05,
"loss": 0.4648,
"step": 1722
},
{
"epoch": 0.8518106538128785,
"grad_norm": 0.12186198090765289,
"learning_rate": 1.788069351186197e-05,
"loss": 0.498,
"step": 1723
},
{
"epoch": 0.8523050302805586,
"grad_norm": 0.11318338550538261,
"learning_rate": 1.78782958868056e-05,
"loss": 0.5101,
"step": 1724
},
{
"epoch": 0.8527994067482387,
"grad_norm": 0.12315552035787314,
"learning_rate": 1.7875897067210463e-05,
"loss": 0.4922,
"step": 1725
},
{
"epoch": 0.853293783215919,
"grad_norm": 0.11395002963095337,
"learning_rate": 1.7873497053440277e-05,
"loss": 0.5275,
"step": 1726
},
{
"epoch": 0.8537881596835991,
"grad_norm": 0.11648149307274194,
"learning_rate": 1.787109584585894e-05,
"loss": 0.5128,
"step": 1727
},
{
"epoch": 0.8542825361512792,
"grad_norm": 0.13348015328964954,
"learning_rate": 1.786869344483054e-05,
"loss": 0.5254,
"step": 1728
},
{
"epoch": 0.8547769126189594,
"grad_norm": 0.11061986367934169,
"learning_rate": 1.7866289850719335e-05,
"loss": 0.5021,
"step": 1729
},
{
"epoch": 0.8552712890866395,
"grad_norm": 0.126841114467787,
"learning_rate": 1.7863885063889766e-05,
"loss": 0.5085,
"step": 1730
},
{
"epoch": 0.8557656655543197,
"grad_norm": 0.11827769759585069,
"learning_rate": 1.7861479084706457e-05,
"loss": 0.5076,
"step": 1731
},
{
"epoch": 0.8562600420219998,
"grad_norm": 0.1281572597854264,
"learning_rate": 1.7859071913534213e-05,
"loss": 0.4928,
"step": 1732
},
{
"epoch": 0.8567544184896799,
"grad_norm": 0.12999774137658435,
"learning_rate": 1.7856663550738017e-05,
"loss": 0.5325,
"step": 1733
},
{
"epoch": 0.8572487949573601,
"grad_norm": 0.11867522495071973,
"learning_rate": 1.7854253996683036e-05,
"loss": 0.5222,
"step": 1734
},
{
"epoch": 0.8577431714250402,
"grad_norm": 0.11323648741955296,
"learning_rate": 1.7851843251734616e-05,
"loss": 0.5283,
"step": 1735
},
{
"epoch": 0.8582375478927203,
"grad_norm": 0.11513533810151373,
"learning_rate": 1.7849431316258284e-05,
"loss": 0.4794,
"step": 1736
},
{
"epoch": 0.8587319243604005,
"grad_norm": 0.11453544363850032,
"learning_rate": 1.784701819061975e-05,
"loss": 0.5457,
"step": 1737
},
{
"epoch": 0.8592263008280806,
"grad_norm": 0.10607532685720836,
"learning_rate": 1.7844603875184897e-05,
"loss": 0.4784,
"step": 1738
},
{
"epoch": 0.8597206772957607,
"grad_norm": 0.26655559379641325,
"learning_rate": 1.7842188370319796e-05,
"loss": 0.526,
"step": 1739
},
{
"epoch": 0.8602150537634409,
"grad_norm": 0.10823026217830549,
"learning_rate": 1.783977167639069e-05,
"loss": 0.5078,
"step": 1740
},
{
"epoch": 0.860709430231121,
"grad_norm": 0.11043701117478097,
"learning_rate": 1.7837353793764022e-05,
"loss": 0.4752,
"step": 1741
},
{
"epoch": 0.8612038066988011,
"grad_norm": 0.3078288606530105,
"learning_rate": 1.7834934722806384e-05,
"loss": 0.5673,
"step": 1742
},
{
"epoch": 0.8616981831664813,
"grad_norm": 0.13400451029436342,
"learning_rate": 1.7832514463884577e-05,
"loss": 0.5305,
"step": 1743
},
{
"epoch": 0.8621925596341614,
"grad_norm": 0.42049328094879007,
"learning_rate": 1.7830093017365563e-05,
"loss": 0.5005,
"step": 1744
},
{
"epoch": 0.8626869361018416,
"grad_norm": 0.23431402341819502,
"learning_rate": 1.78276703836165e-05,
"loss": 0.4988,
"step": 1745
},
{
"epoch": 0.8631813125695217,
"grad_norm": 0.24226075145704695,
"learning_rate": 1.7825246563004707e-05,
"loss": 0.5211,
"step": 1746
},
{
"epoch": 0.8636756890372018,
"grad_norm": 0.8960849443043089,
"learning_rate": 1.78228215558977e-05,
"loss": 0.6522,
"step": 1747
},
{
"epoch": 0.864170065504882,
"grad_norm": 0.10887186434150774,
"learning_rate": 1.7820395362663166e-05,
"loss": 0.471,
"step": 1748
},
{
"epoch": 0.8646644419725621,
"grad_norm": 0.11709153264826068,
"learning_rate": 1.7817967983668975e-05,
"loss": 0.5168,
"step": 1749
},
{
"epoch": 0.8651588184402422,
"grad_norm": 0.2257677807230984,
"learning_rate": 1.7815539419283178e-05,
"loss": 0.4875,
"step": 1750
},
{
"epoch": 0.8656531949079224,
"grad_norm": 0.11283154728279766,
"learning_rate": 1.7813109669874e-05,
"loss": 0.4922,
"step": 1751
},
{
"epoch": 0.8661475713756025,
"grad_norm": 4.584746355740341,
"learning_rate": 1.781067873580985e-05,
"loss": 0.5849,
"step": 1752
},
{
"epoch": 0.8666419478432826,
"grad_norm": 0.14978648134394623,
"learning_rate": 1.7808246617459316e-05,
"loss": 0.5104,
"step": 1753
},
{
"epoch": 0.8671363243109628,
"grad_norm": 0.19135651628010053,
"learning_rate": 1.780581331519117e-05,
"loss": 0.5367,
"step": 1754
},
{
"epoch": 0.8676307007786429,
"grad_norm": 0.12749690282323387,
"learning_rate": 1.7803378829374353e-05,
"loss": 0.4984,
"step": 1755
},
{
"epoch": 0.868125077246323,
"grad_norm": 0.13577664920085847,
"learning_rate": 1.7800943160377993e-05,
"loss": 0.5215,
"step": 1756
},
{
"epoch": 0.8686194537140032,
"grad_norm": 0.3006726745252358,
"learning_rate": 1.7798506308571398e-05,
"loss": 0.4995,
"step": 1757
},
{
"epoch": 0.8691138301816833,
"grad_norm": 0.4758679751518057,
"learning_rate": 1.779606827432405e-05,
"loss": 0.4773,
"step": 1758
},
{
"epoch": 0.8696082066493634,
"grad_norm": 0.22549345782975044,
"learning_rate": 1.7793629058005617e-05,
"loss": 0.5367,
"step": 1759
},
{
"epoch": 0.8701025831170436,
"grad_norm": 0.12423098240418684,
"learning_rate": 1.7791188659985942e-05,
"loss": 0.5288,
"step": 1760
},
{
"epoch": 0.8705969595847237,
"grad_norm": 0.1605773465615925,
"learning_rate": 1.7788747080635046e-05,
"loss": 0.5155,
"step": 1761
},
{
"epoch": 0.8710913360524039,
"grad_norm": 0.19898052155681914,
"learning_rate": 1.7786304320323134e-05,
"loss": 0.5016,
"step": 1762
},
{
"epoch": 0.8715857125200841,
"grad_norm": 0.1158858400468247,
"learning_rate": 1.7783860379420584e-05,
"loss": 0.4846,
"step": 1763
},
{
"epoch": 0.8720800889877642,
"grad_norm": 0.12828046485299696,
"learning_rate": 1.7781415258297957e-05,
"loss": 0.5335,
"step": 1764
},
{
"epoch": 0.8725744654554444,
"grad_norm": 0.3675072020992055,
"learning_rate": 1.777896895732599e-05,
"loss": 0.5345,
"step": 1765
},
{
"epoch": 0.8730688419231245,
"grad_norm": 0.13355981249097615,
"learning_rate": 1.7776521476875608e-05,
"loss": 0.4867,
"step": 1766
},
{
"epoch": 0.8735632183908046,
"grad_norm": 0.11608431243776694,
"learning_rate": 1.77740728173179e-05,
"loss": 0.5102,
"step": 1767
},
{
"epoch": 0.8740575948584848,
"grad_norm": 0.1327157647855833,
"learning_rate": 1.7771622979024145e-05,
"loss": 0.5148,
"step": 1768
},
{
"epoch": 0.8745519713261649,
"grad_norm": 0.12463925138006396,
"learning_rate": 1.7769171962365797e-05,
"loss": 0.5131,
"step": 1769
},
{
"epoch": 0.875046347793845,
"grad_norm": 0.12012616191647382,
"learning_rate": 1.776671976771449e-05,
"loss": 0.4675,
"step": 1770
},
{
"epoch": 0.8755407242615252,
"grad_norm": 1.0170760269696837,
"learning_rate": 1.7764266395442033e-05,
"loss": 0.5192,
"step": 1771
},
{
"epoch": 0.8760351007292053,
"grad_norm": 0.5217149667646493,
"learning_rate": 1.776181184592042e-05,
"loss": 0.5334,
"step": 1772
},
{
"epoch": 0.8765294771968855,
"grad_norm": 0.12147529825814805,
"learning_rate": 1.7759356119521815e-05,
"loss": 0.4969,
"step": 1773
},
{
"epoch": 0.8770238536645656,
"grad_norm": 0.12264995681883925,
"learning_rate": 1.775689921661857e-05,
"loss": 0.5344,
"step": 1774
},
{
"epoch": 0.8775182301322457,
"grad_norm": 0.13197572092637244,
"learning_rate": 1.7754441137583205e-05,
"loss": 0.4921,
"step": 1775
},
{
"epoch": 0.8780126065999259,
"grad_norm": 0.44988261332279533,
"learning_rate": 1.7751981882788427e-05,
"loss": 0.5034,
"step": 1776
},
{
"epoch": 0.878506983067606,
"grad_norm": 0.2872520854841971,
"learning_rate": 1.774952145260712e-05,
"loss": 0.5247,
"step": 1777
},
{
"epoch": 0.8790013595352861,
"grad_norm": 0.12242724128453643,
"learning_rate": 1.774705984741234e-05,
"loss": 0.4789,
"step": 1778
},
{
"epoch": 0.8794957360029663,
"grad_norm": 0.12271014313191918,
"learning_rate": 1.7744597067577327e-05,
"loss": 0.5275,
"step": 1779
},
{
"epoch": 0.8799901124706464,
"grad_norm": 0.1467654676637994,
"learning_rate": 1.7742133113475497e-05,
"loss": 0.5068,
"step": 1780
},
{
"epoch": 0.8804844889383265,
"grad_norm": 0.21488255733943284,
"learning_rate": 1.7739667985480447e-05,
"loss": 0.5099,
"step": 1781
},
{
"epoch": 0.8809788654060067,
"grad_norm": 0.17480823763405012,
"learning_rate": 1.773720168396595e-05,
"loss": 0.5495,
"step": 1782
},
{
"epoch": 0.8814732418736868,
"grad_norm": 0.11327904316257556,
"learning_rate": 1.773473420930595e-05,
"loss": 0.4973,
"step": 1783
},
{
"epoch": 0.8819676183413669,
"grad_norm": 0.12833939037776784,
"learning_rate": 1.7732265561874583e-05,
"loss": 0.5292,
"step": 1784
},
{
"epoch": 0.8824619948090471,
"grad_norm": 0.12290260743088498,
"learning_rate": 1.7729795742046148e-05,
"loss": 0.5104,
"step": 1785
},
{
"epoch": 0.8829563712767272,
"grad_norm": 0.12027311744747048,
"learning_rate": 1.772732475019514e-05,
"loss": 0.5594,
"step": 1786
},
{
"epoch": 0.8834507477444073,
"grad_norm": 0.3223131447512736,
"learning_rate": 1.772485258669621e-05,
"loss": 0.5682,
"step": 1787
},
{
"epoch": 0.8839451242120875,
"grad_norm": 0.12003467823774631,
"learning_rate": 1.77223792519242e-05,
"loss": 0.5064,
"step": 1788
},
{
"epoch": 0.8844395006797676,
"grad_norm": 0.17700163505328412,
"learning_rate": 1.771990474625413e-05,
"loss": 0.5203,
"step": 1789
},
{
"epoch": 0.8849338771474478,
"grad_norm": 0.1393223699140783,
"learning_rate": 1.7717429070061195e-05,
"loss": 0.4862,
"step": 1790
},
{
"epoch": 0.8854282536151279,
"grad_norm": 0.11305323135452838,
"learning_rate": 1.771495222372076e-05,
"loss": 0.5069,
"step": 1791
},
{
"epoch": 0.885922630082808,
"grad_norm": 0.18108600631028812,
"learning_rate": 1.771247420760838e-05,
"loss": 0.533,
"step": 1792
},
{
"epoch": 0.8864170065504882,
"grad_norm": 0.13485386523217516,
"learning_rate": 1.770999502209978e-05,
"loss": 0.5119,
"step": 1793
},
{
"epoch": 0.8869113830181683,
"grad_norm": 0.13340509933039943,
"learning_rate": 1.7707514667570865e-05,
"loss": 0.5072,
"step": 1794
},
{
"epoch": 0.8874057594858484,
"grad_norm": 0.12383696887281626,
"learning_rate": 1.770503314439772e-05,
"loss": 0.5056,
"step": 1795
},
{
"epoch": 0.8879001359535286,
"grad_norm": 0.12348126412325203,
"learning_rate": 1.7702550452956593e-05,
"loss": 0.518,
"step": 1796
},
{
"epoch": 0.8883945124212087,
"grad_norm": 1.0286680857735078,
"learning_rate": 1.770006659362393e-05,
"loss": 0.5677,
"step": 1797
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.17632147635276132,
"learning_rate": 1.7697581566776338e-05,
"loss": 0.4974,
"step": 1798
},
{
"epoch": 0.889383265356569,
"grad_norm": 0.11838609853636962,
"learning_rate": 1.7695095372790607e-05,
"loss": 0.4891,
"step": 1799
},
{
"epoch": 0.8898776418242491,
"grad_norm": 0.11888520676921964,
"learning_rate": 1.7692608012043707e-05,
"loss": 0.5113,
"step": 1800
},
{
"epoch": 0.8903720182919294,
"grad_norm": 0.12305684293306558,
"learning_rate": 1.769011948491278e-05,
"loss": 0.4985,
"step": 1801
},
{
"epoch": 0.8908663947596095,
"grad_norm": 0.12229216878393678,
"learning_rate": 1.7687629791775146e-05,
"loss": 0.482,
"step": 1802
},
{
"epoch": 0.8913607712272896,
"grad_norm": 0.12103185731174171,
"learning_rate": 1.76851389330083e-05,
"loss": 0.4759,
"step": 1803
},
{
"epoch": 0.8918551476949698,
"grad_norm": 0.1152589623845976,
"learning_rate": 1.7682646908989923e-05,
"loss": 0.5072,
"step": 1804
},
{
"epoch": 0.8923495241626499,
"grad_norm": 0.5062846137283104,
"learning_rate": 1.7680153720097856e-05,
"loss": 0.5209,
"step": 1805
},
{
"epoch": 0.89284390063033,
"grad_norm": 0.12314995690402276,
"learning_rate": 1.7677659366710134e-05,
"loss": 0.507,
"step": 1806
},
{
"epoch": 0.8933382770980102,
"grad_norm": 0.15431080908421918,
"learning_rate": 1.767516384920496e-05,
"loss": 0.5061,
"step": 1807
},
{
"epoch": 0.8938326535656903,
"grad_norm": 0.13917563766834584,
"learning_rate": 1.7672667167960705e-05,
"loss": 0.4903,
"step": 1808
},
{
"epoch": 0.8943270300333704,
"grad_norm": 0.17196650997768648,
"learning_rate": 1.767016932335594e-05,
"loss": 0.533,
"step": 1809
},
{
"epoch": 0.8948214065010506,
"grad_norm": 0.37664905924999414,
"learning_rate": 1.7667670315769388e-05,
"loss": 0.5174,
"step": 1810
},
{
"epoch": 0.8953157829687307,
"grad_norm": 0.15330320906809627,
"learning_rate": 1.7665170145579965e-05,
"loss": 0.4906,
"step": 1811
},
{
"epoch": 0.8958101594364108,
"grad_norm": 0.12903960806881187,
"learning_rate": 1.7662668813166753e-05,
"loss": 0.5089,
"step": 1812
},
{
"epoch": 0.896304535904091,
"grad_norm": 0.11849093258423142,
"learning_rate": 1.7660166318909014e-05,
"loss": 0.5158,
"step": 1813
},
{
"epoch": 0.8967989123717711,
"grad_norm": 0.12345395768885428,
"learning_rate": 1.7657662663186186e-05,
"loss": 0.4756,
"step": 1814
},
{
"epoch": 0.8972932888394513,
"grad_norm": 0.13274633743696587,
"learning_rate": 1.7655157846377885e-05,
"loss": 0.4611,
"step": 1815
},
{
"epoch": 0.8977876653071314,
"grad_norm": 0.13650439359819594,
"learning_rate": 1.7652651868863904e-05,
"loss": 0.4797,
"step": 1816
},
{
"epoch": 0.8982820417748115,
"grad_norm": 0.5528483131285505,
"learning_rate": 1.7650144731024205e-05,
"loss": 0.4871,
"step": 1817
},
{
"epoch": 0.8987764182424917,
"grad_norm": 0.12649597930610532,
"learning_rate": 1.764763643323893e-05,
"loss": 0.4944,
"step": 1818
},
{
"epoch": 0.8992707947101718,
"grad_norm": 0.11671753726326113,
"learning_rate": 1.7645126975888396e-05,
"loss": 0.4925,
"step": 1819
},
{
"epoch": 0.8997651711778519,
"grad_norm": 0.20855142076496525,
"learning_rate": 1.76426163593531e-05,
"loss": 0.5024,
"step": 1820
},
{
"epoch": 0.9002595476455321,
"grad_norm": 0.2040363197641712,
"learning_rate": 1.7640104584013715e-05,
"loss": 0.5134,
"step": 1821
},
{
"epoch": 0.9007539241132122,
"grad_norm": 0.1450455954963181,
"learning_rate": 1.7637591650251077e-05,
"loss": 0.5271,
"step": 1822
},
{
"epoch": 0.9012483005808923,
"grad_norm": 0.1282218798117356,
"learning_rate": 1.7635077558446217e-05,
"loss": 0.4894,
"step": 1823
},
{
"epoch": 0.9017426770485725,
"grad_norm": 0.11974918750525368,
"learning_rate": 1.7632562308980327e-05,
"loss": 0.4979,
"step": 1824
},
{
"epoch": 0.9022370535162526,
"grad_norm": 0.14043902691528726,
"learning_rate": 1.763004590223478e-05,
"loss": 0.5226,
"step": 1825
},
{
"epoch": 0.9027314299839327,
"grad_norm": 2.5761041418286466,
"learning_rate": 1.762752833859112e-05,
"loss": 0.5311,
"step": 1826
},
{
"epoch": 0.9032258064516129,
"grad_norm": 1.060721591661206,
"learning_rate": 1.7625009618431077e-05,
"loss": 0.5197,
"step": 1827
},
{
"epoch": 0.903720182919293,
"grad_norm": 0.12327637135436184,
"learning_rate": 1.7622489742136546e-05,
"loss": 0.4707,
"step": 1828
},
{
"epoch": 0.9042145593869731,
"grad_norm": 0.30405681660734696,
"learning_rate": 1.7619968710089597e-05,
"loss": 0.4895,
"step": 1829
},
{
"epoch": 0.9047089358546533,
"grad_norm": 1.2683234294701475,
"learning_rate": 1.7617446522672486e-05,
"loss": 0.6795,
"step": 1830
},
{
"epoch": 0.9052033123223334,
"grad_norm": 0.17433279006428745,
"learning_rate": 1.761492318026763e-05,
"loss": 0.5152,
"step": 1831
},
{
"epoch": 0.9056976887900136,
"grad_norm": 0.1387794589063158,
"learning_rate": 1.7612398683257635e-05,
"loss": 0.4894,
"step": 1832
},
{
"epoch": 0.9061920652576937,
"grad_norm": 0.19088671133934518,
"learning_rate": 1.7609873032025274e-05,
"loss": 0.4791,
"step": 1833
},
{
"epoch": 0.9066864417253738,
"grad_norm": 0.3587381805007322,
"learning_rate": 1.760734622695349e-05,
"loss": 0.5196,
"step": 1834
},
{
"epoch": 0.907180818193054,
"grad_norm": 0.17300375704597998,
"learning_rate": 1.7604818268425412e-05,
"loss": 0.5097,
"step": 1835
},
{
"epoch": 0.9076751946607341,
"grad_norm": 0.15211013826649145,
"learning_rate": 1.760228915682434e-05,
"loss": 0.5161,
"step": 1836
},
{
"epoch": 0.9081695711284142,
"grad_norm": 0.14692391677439792,
"learning_rate": 1.759975889253375e-05,
"loss": 0.4951,
"step": 1837
},
{
"epoch": 0.9086639475960945,
"grad_norm": 0.4797705652446713,
"learning_rate": 1.7597227475937285e-05,
"loss": 0.5006,
"step": 1838
},
{
"epoch": 0.9091583240637746,
"grad_norm": 0.139996144357709,
"learning_rate": 1.7594694907418773e-05,
"loss": 0.4923,
"step": 1839
},
{
"epoch": 0.9096527005314547,
"grad_norm": 0.1500608067767587,
"learning_rate": 1.7592161187362208e-05,
"loss": 0.4743,
"step": 1840
},
{
"epoch": 0.9101470769991349,
"grad_norm": 0.15481415090508824,
"learning_rate": 1.7589626316151767e-05,
"loss": 0.519,
"step": 1841
},
{
"epoch": 0.910641453466815,
"grad_norm": 0.14725150034537396,
"learning_rate": 1.7587090294171797e-05,
"loss": 0.4916,
"step": 1842
},
{
"epoch": 0.9111358299344952,
"grad_norm": 0.1229147776405881,
"learning_rate": 1.7584553121806817e-05,
"loss": 0.5342,
"step": 1843
},
{
"epoch": 0.9116302064021753,
"grad_norm": 0.12773368956451997,
"learning_rate": 1.7582014799441524e-05,
"loss": 0.532,
"step": 1844
},
{
"epoch": 0.9121245828698554,
"grad_norm": 0.11594968526983311,
"learning_rate": 1.757947532746079e-05,
"loss": 0.478,
"step": 1845
},
{
"epoch": 0.9126189593375356,
"grad_norm": 0.13001787482180066,
"learning_rate": 1.757693470624966e-05,
"loss": 0.4991,
"step": 1846
},
{
"epoch": 0.9131133358052157,
"grad_norm": 1.5161068917715557,
"learning_rate": 1.7574392936193354e-05,
"loss": 0.5765,
"step": 1847
},
{
"epoch": 0.9136077122728958,
"grad_norm": 0.12921286420390574,
"learning_rate": 1.757185001767726e-05,
"loss": 0.4781,
"step": 1848
},
{
"epoch": 0.914102088740576,
"grad_norm": 0.13133881221020705,
"learning_rate": 1.756930595108695e-05,
"loss": 0.5213,
"step": 1849
},
{
"epoch": 0.9145964652082561,
"grad_norm": 0.12403487654487093,
"learning_rate": 1.7566760736808167e-05,
"loss": 0.4843,
"step": 1850
},
{
"epoch": 0.9150908416759362,
"grad_norm": 0.13290178828957477,
"learning_rate": 1.7564214375226822e-05,
"loss": 0.5257,
"step": 1851
},
{
"epoch": 0.9155852181436164,
"grad_norm": 0.12012537384128678,
"learning_rate": 1.7561666866729006e-05,
"loss": 0.5005,
"step": 1852
},
{
"epoch": 0.9160795946112965,
"grad_norm": 0.1526490274982488,
"learning_rate": 1.755911821170099e-05,
"loss": 0.4951,
"step": 1853
},
{
"epoch": 0.9165739710789766,
"grad_norm": 0.20426888910256433,
"learning_rate": 1.75565684105292e-05,
"loss": 0.5491,
"step": 1854
},
{
"epoch": 0.9170683475466568,
"grad_norm": 0.11561260157843543,
"learning_rate": 1.755401746360025e-05,
"loss": 0.4777,
"step": 1855
},
{
"epoch": 0.9175627240143369,
"grad_norm": 0.13149558050145174,
"learning_rate": 1.7551465371300928e-05,
"loss": 0.5296,
"step": 1856
},
{
"epoch": 0.918057100482017,
"grad_norm": 0.12063834227219081,
"learning_rate": 1.7548912134018193e-05,
"loss": 0.5113,
"step": 1857
},
{
"epoch": 0.9185514769496972,
"grad_norm": 0.1313153502308165,
"learning_rate": 1.7546357752139173e-05,
"loss": 0.5152,
"step": 1858
},
{
"epoch": 0.9190458534173773,
"grad_norm": 0.7414853851454106,
"learning_rate": 1.7543802226051178e-05,
"loss": 0.5256,
"step": 1859
},
{
"epoch": 0.9195402298850575,
"grad_norm": 0.12521140951811835,
"learning_rate": 1.754124555614168e-05,
"loss": 0.4805,
"step": 1860
},
{
"epoch": 0.9200346063527376,
"grad_norm": 0.19994183348982433,
"learning_rate": 1.753868774279834e-05,
"loss": 0.5025,
"step": 1861
},
{
"epoch": 0.9205289828204177,
"grad_norm": 0.23925034715549473,
"learning_rate": 1.753612878640898e-05,
"loss": 0.4917,
"step": 1862
},
{
"epoch": 0.9210233592880979,
"grad_norm": 0.19454782977805102,
"learning_rate": 1.75335686873616e-05,
"loss": 0.5078,
"step": 1863
},
{
"epoch": 0.921517735755778,
"grad_norm": 0.18747036882065327,
"learning_rate": 1.7531007446044366e-05,
"loss": 0.509,
"step": 1864
},
{
"epoch": 0.9220121122234581,
"grad_norm": 0.4317297481546159,
"learning_rate": 1.7528445062845636e-05,
"loss": 0.52,
"step": 1865
},
{
"epoch": 0.9225064886911383,
"grad_norm": 0.16104585718087308,
"learning_rate": 1.752588153815392e-05,
"loss": 0.505,
"step": 1866
},
{
"epoch": 0.9230008651588184,
"grad_norm": 0.1418498184350066,
"learning_rate": 1.752331687235791e-05,
"loss": 0.5287,
"step": 1867
},
{
"epoch": 0.9234952416264985,
"grad_norm": 0.7494261458126774,
"learning_rate": 1.7520751065846477e-05,
"loss": 0.5255,
"step": 1868
},
{
"epoch": 0.9239896180941787,
"grad_norm": 0.16703063705870683,
"learning_rate": 1.7518184119008655e-05,
"loss": 0.5339,
"step": 1869
},
{
"epoch": 0.9244839945618588,
"grad_norm": 1.180953013221965,
"learning_rate": 1.7515616032233652e-05,
"loss": 0.517,
"step": 1870
},
{
"epoch": 0.924978371029539,
"grad_norm": 0.21908337157825428,
"learning_rate": 1.7513046805910855e-05,
"loss": 0.5043,
"step": 1871
},
{
"epoch": 0.9254727474972191,
"grad_norm": 0.15506791859209548,
"learning_rate": 1.751047644042982e-05,
"loss": 0.4773,
"step": 1872
},
{
"epoch": 0.9259671239648992,
"grad_norm": 0.19935540417116476,
"learning_rate": 1.7507904936180275e-05,
"loss": 0.5222,
"step": 1873
},
{
"epoch": 0.9264615004325794,
"grad_norm": 0.1695629208357122,
"learning_rate": 1.7505332293552123e-05,
"loss": 0.5409,
"step": 1874
},
{
"epoch": 0.9269558769002596,
"grad_norm": 0.15409727868861264,
"learning_rate": 1.750275851293544e-05,
"loss": 0.5079,
"step": 1875
},
{
"epoch": 0.9274502533679397,
"grad_norm": 0.25121423907721563,
"learning_rate": 1.750018359472047e-05,
"loss": 0.5597,
"step": 1876
},
{
"epoch": 0.9279446298356199,
"grad_norm": 0.13000387721650458,
"learning_rate": 1.749760753929763e-05,
"loss": 0.4852,
"step": 1877
},
{
"epoch": 0.9284390063033,
"grad_norm": 0.17764476534878504,
"learning_rate": 1.7495030347057516e-05,
"loss": 0.5319,
"step": 1878
},
{
"epoch": 0.9289333827709801,
"grad_norm": 0.13118411831859736,
"learning_rate": 1.7492452018390896e-05,
"loss": 0.5204,
"step": 1879
},
{
"epoch": 0.9294277592386603,
"grad_norm": 0.1311934864022076,
"learning_rate": 1.7489872553688697e-05,
"loss": 0.523,
"step": 1880
},
{
"epoch": 0.9299221357063404,
"grad_norm": 0.13478875537955937,
"learning_rate": 1.7487291953342033e-05,
"loss": 0.4828,
"step": 1881
},
{
"epoch": 0.9304165121740205,
"grad_norm": 0.1466176211176656,
"learning_rate": 1.7484710217742185e-05,
"loss": 0.5207,
"step": 1882
},
{
"epoch": 0.9309108886417007,
"grad_norm": 0.12158025370459037,
"learning_rate": 1.748212734728061e-05,
"loss": 0.4947,
"step": 1883
},
{
"epoch": 0.9314052651093808,
"grad_norm": 0.11522640562827904,
"learning_rate": 1.7479543342348923e-05,
"loss": 0.4626,
"step": 1884
},
{
"epoch": 0.931899641577061,
"grad_norm": 0.19385090396341306,
"learning_rate": 1.7476958203338926e-05,
"loss": 0.511,
"step": 1885
},
{
"epoch": 0.9323940180447411,
"grad_norm": 0.1893012684842483,
"learning_rate": 1.7474371930642594e-05,
"loss": 0.5091,
"step": 1886
},
{
"epoch": 0.9328883945124212,
"grad_norm": 0.12473053600346433,
"learning_rate": 1.7471784524652062e-05,
"loss": 0.5257,
"step": 1887
},
{
"epoch": 0.9333827709801014,
"grad_norm": 0.12727206078656117,
"learning_rate": 1.7469195985759643e-05,
"loss": 0.5327,
"step": 1888
},
{
"epoch": 0.9338771474477815,
"grad_norm": 0.11539507020456165,
"learning_rate": 1.7466606314357823e-05,
"loss": 0.4942,
"step": 1889
},
{
"epoch": 0.9343715239154616,
"grad_norm": 0.11499009701213586,
"learning_rate": 1.7464015510839257e-05,
"loss": 0.5296,
"step": 1890
},
{
"epoch": 0.9348659003831418,
"grad_norm": 0.12162338791883734,
"learning_rate": 1.7461423575596775e-05,
"loss": 0.5098,
"step": 1891
},
{
"epoch": 0.9353602768508219,
"grad_norm": 0.10824243982892294,
"learning_rate": 1.7458830509023377e-05,
"loss": 0.5042,
"step": 1892
},
{
"epoch": 0.935854653318502,
"grad_norm": 0.21075594858600552,
"learning_rate": 1.745623631151223e-05,
"loss": 0.5029,
"step": 1893
},
{
"epoch": 0.9363490297861822,
"grad_norm": 0.10493641955941883,
"learning_rate": 1.745364098345668e-05,
"loss": 0.479,
"step": 1894
},
{
"epoch": 0.9368434062538623,
"grad_norm": 0.12191869025125324,
"learning_rate": 1.745104452525024e-05,
"loss": 0.4691,
"step": 1895
},
{
"epoch": 0.9373377827215424,
"grad_norm": 0.11317245630389668,
"learning_rate": 1.7448446937286594e-05,
"loss": 0.5213,
"step": 1896
},
{
"epoch": 0.9378321591892226,
"grad_norm": 0.1389703115776053,
"learning_rate": 1.7445848219959606e-05,
"loss": 0.522,
"step": 1897
},
{
"epoch": 0.9383265356569027,
"grad_norm": 0.10623300575292441,
"learning_rate": 1.7443248373663293e-05,
"loss": 0.5023,
"step": 1898
},
{
"epoch": 0.9388209121245829,
"grad_norm": 0.1091922500625693,
"learning_rate": 1.7440647398791862e-05,
"loss": 0.5255,
"step": 1899
},
{
"epoch": 0.939315288592263,
"grad_norm": 0.10858200338278615,
"learning_rate": 1.7438045295739678e-05,
"loss": 0.5149,
"step": 1900
},
{
"epoch": 0.9398096650599431,
"grad_norm": 0.19913655207120312,
"learning_rate": 1.7435442064901288e-05,
"loss": 0.5223,
"step": 1901
},
{
"epoch": 0.9403040415276233,
"grad_norm": 0.15381591522553295,
"learning_rate": 1.74328377066714e-05,
"loss": 0.5163,
"step": 1902
},
{
"epoch": 0.9407984179953034,
"grad_norm": 0.11518841359304881,
"learning_rate": 1.74302322214449e-05,
"loss": 0.48,
"step": 1903
},
{
"epoch": 0.9412927944629835,
"grad_norm": 0.11303571052170001,
"learning_rate": 1.742762560961684e-05,
"loss": 0.5193,
"step": 1904
},
{
"epoch": 0.9417871709306637,
"grad_norm": 0.12329529337776252,
"learning_rate": 1.7425017871582442e-05,
"loss": 0.4752,
"step": 1905
},
{
"epoch": 0.9422815473983438,
"grad_norm": 0.12490261039271161,
"learning_rate": 1.742240900773711e-05,
"loss": 0.5105,
"step": 1906
},
{
"epoch": 0.9427759238660239,
"grad_norm": 0.12985617353152254,
"learning_rate": 1.7419799018476404e-05,
"loss": 0.5084,
"step": 1907
},
{
"epoch": 0.9432703003337041,
"grad_norm": 0.11412593496346474,
"learning_rate": 1.741718790419606e-05,
"loss": 0.5226,
"step": 1908
},
{
"epoch": 0.9437646768013842,
"grad_norm": 0.12524841852806964,
"learning_rate": 1.741457566529199e-05,
"loss": 0.5272,
"step": 1909
},
{
"epoch": 0.9442590532690643,
"grad_norm": 0.10401841377996815,
"learning_rate": 1.741196230216027e-05,
"loss": 0.4935,
"step": 1910
},
{
"epoch": 0.9447534297367445,
"grad_norm": 0.12193798770044272,
"learning_rate": 1.7409347815197148e-05,
"loss": 0.4982,
"step": 1911
},
{
"epoch": 0.9452478062044247,
"grad_norm": 0.11508412454073827,
"learning_rate": 1.7406732204799046e-05,
"loss": 0.4871,
"step": 1912
},
{
"epoch": 0.9457421826721049,
"grad_norm": 0.12061117786931354,
"learning_rate": 1.7404115471362552e-05,
"loss": 0.4752,
"step": 1913
},
{
"epoch": 0.946236559139785,
"grad_norm": 0.10961598831455957,
"learning_rate": 1.740149761528442e-05,
"loss": 0.4923,
"step": 1914
},
{
"epoch": 0.9467309356074651,
"grad_norm": 0.12227015613830369,
"learning_rate": 1.739887863696159e-05,
"loss": 0.5023,
"step": 1915
},
{
"epoch": 0.9472253120751453,
"grad_norm": 0.11268947239054959,
"learning_rate": 1.7396258536791152e-05,
"loss": 0.5139,
"step": 1916
},
{
"epoch": 0.9477196885428254,
"grad_norm": 0.10234013397968253,
"learning_rate": 1.7393637315170385e-05,
"loss": 0.4521,
"step": 1917
},
{
"epoch": 0.9482140650105055,
"grad_norm": 0.10437322764788698,
"learning_rate": 1.739101497249672e-05,
"loss": 0.5,
"step": 1918
},
{
"epoch": 0.9487084414781857,
"grad_norm": 0.11221036565002236,
"learning_rate": 1.7388391509167774e-05,
"loss": 0.5322,
"step": 1919
},
{
"epoch": 0.9492028179458658,
"grad_norm": 0.3851517681659481,
"learning_rate": 1.7385766925581327e-05,
"loss": 0.5564,
"step": 1920
},
{
"epoch": 0.9496971944135459,
"grad_norm": 0.1092156571736689,
"learning_rate": 1.7383141222135324e-05,
"loss": 0.5056,
"step": 1921
},
{
"epoch": 0.9501915708812261,
"grad_norm": 0.11397207022732253,
"learning_rate": 1.7380514399227888e-05,
"loss": 0.5069,
"step": 1922
},
{
"epoch": 0.9506859473489062,
"grad_norm": 0.10708935176480844,
"learning_rate": 1.737788645725731e-05,
"loss": 0.5099,
"step": 1923
},
{
"epoch": 0.9511803238165863,
"grad_norm": 0.11174290099175418,
"learning_rate": 1.7375257396622044e-05,
"loss": 0.4647,
"step": 1924
},
{
"epoch": 0.9516747002842665,
"grad_norm": 0.10228819612827451,
"learning_rate": 1.7372627217720723e-05,
"loss": 0.4969,
"step": 1925
},
{
"epoch": 0.9521690767519466,
"grad_norm": 0.11119526529771495,
"learning_rate": 1.736999592095214e-05,
"loss": 0.5089,
"step": 1926
},
{
"epoch": 0.9526634532196268,
"grad_norm": 0.11576717538889876,
"learning_rate": 1.736736350671527e-05,
"loss": 0.5195,
"step": 1927
},
{
"epoch": 0.9531578296873069,
"grad_norm": 0.10580780806170863,
"learning_rate": 1.736472997540925e-05,
"loss": 0.4939,
"step": 1928
},
{
"epoch": 0.953652206154987,
"grad_norm": 0.11616727243047428,
"learning_rate": 1.736209532743338e-05,
"loss": 0.4804,
"step": 1929
},
{
"epoch": 0.9541465826226672,
"grad_norm": 1.691689406785698,
"learning_rate": 1.735945956318714e-05,
"loss": 0.6641,
"step": 1930
},
{
"epoch": 0.9546409590903473,
"grad_norm": 0.11892455888951561,
"learning_rate": 1.7356822683070174e-05,
"loss": 0.5147,
"step": 1931
},
{
"epoch": 0.9551353355580274,
"grad_norm": 0.12140653820799666,
"learning_rate": 1.7354184687482294e-05,
"loss": 0.4922,
"step": 1932
},
{
"epoch": 0.9556297120257076,
"grad_norm": 0.12198184839881038,
"learning_rate": 1.735154557682349e-05,
"loss": 0.5008,
"step": 1933
},
{
"epoch": 0.9561240884933877,
"grad_norm": 0.16679274634734537,
"learning_rate": 1.734890535149391e-05,
"loss": 0.4912,
"step": 1934
},
{
"epoch": 0.9566184649610678,
"grad_norm": 0.13330015596074277,
"learning_rate": 1.7346264011893876e-05,
"loss": 0.5466,
"step": 1935
},
{
"epoch": 0.957112841428748,
"grad_norm": 0.12294646429759998,
"learning_rate": 1.7343621558423877e-05,
"loss": 0.4901,
"step": 1936
},
{
"epoch": 0.9576072178964281,
"grad_norm": 0.12642744747068507,
"learning_rate": 1.7340977991484577e-05,
"loss": 0.5065,
"step": 1937
},
{
"epoch": 0.9581015943641082,
"grad_norm": 0.11914855150016194,
"learning_rate": 1.73383333114768e-05,
"loss": 0.4663,
"step": 1938
},
{
"epoch": 0.9585959708317884,
"grad_norm": 0.10767832841003909,
"learning_rate": 1.7335687518801543e-05,
"loss": 0.4892,
"step": 1939
},
{
"epoch": 0.9590903472994685,
"grad_norm": 0.11468953172156389,
"learning_rate": 1.7333040613859974e-05,
"loss": 0.5078,
"step": 1940
},
{
"epoch": 0.9595847237671487,
"grad_norm": 0.18927310316766086,
"learning_rate": 1.7330392597053423e-05,
"loss": 0.489,
"step": 1941
},
{
"epoch": 0.9600791002348288,
"grad_norm": 0.1087547362572362,
"learning_rate": 1.73277434687834e-05,
"loss": 0.4704,
"step": 1942
},
{
"epoch": 0.9605734767025089,
"grad_norm": 0.1239077083642393,
"learning_rate": 1.732509322945157e-05,
"loss": 0.5099,
"step": 1943
},
{
"epoch": 0.9610678531701891,
"grad_norm": 0.11837716531608712,
"learning_rate": 1.7322441879459778e-05,
"loss": 0.5173,
"step": 1944
},
{
"epoch": 0.9615622296378692,
"grad_norm": 0.11789629767939079,
"learning_rate": 1.731978941921003e-05,
"loss": 0.5185,
"step": 1945
},
{
"epoch": 0.9620566061055493,
"grad_norm": 0.12116477483128318,
"learning_rate": 1.73171358491045e-05,
"loss": 0.488,
"step": 1946
},
{
"epoch": 0.9625509825732295,
"grad_norm": 0.10940679555107878,
"learning_rate": 1.731448116954554e-05,
"loss": 0.4652,
"step": 1947
},
{
"epoch": 0.9630453590409096,
"grad_norm": 0.11055609525912835,
"learning_rate": 1.7311825380935654e-05,
"loss": 0.4955,
"step": 1948
},
{
"epoch": 0.9635397355085898,
"grad_norm": 0.11213309803710503,
"learning_rate": 1.7309168483677527e-05,
"loss": 0.5158,
"step": 1949
},
{
"epoch": 0.96403411197627,
"grad_norm": 0.10552293599506561,
"learning_rate": 1.7306510478174014e-05,
"loss": 0.4705,
"step": 1950
},
{
"epoch": 0.9645284884439501,
"grad_norm": 0.1004280677668702,
"learning_rate": 1.7303851364828124e-05,
"loss": 0.487,
"step": 1951
},
{
"epoch": 0.9650228649116303,
"grad_norm": 0.10467756226397039,
"learning_rate": 1.7301191144043047e-05,
"loss": 0.4931,
"step": 1952
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.12179055250829078,
"learning_rate": 1.7298529816222134e-05,
"loss": 0.4844,
"step": 1953
},
{
"epoch": 0.9660116178469905,
"grad_norm": 0.10418622863667028,
"learning_rate": 1.7295867381768908e-05,
"loss": 0.4852,
"step": 1954
},
{
"epoch": 0.9665059943146707,
"grad_norm": 0.13423393757765115,
"learning_rate": 1.7293203841087058e-05,
"loss": 0.4989,
"step": 1955
},
{
"epoch": 0.9670003707823508,
"grad_norm": 0.10748168444379566,
"learning_rate": 1.729053919458044e-05,
"loss": 0.5091,
"step": 1956
},
{
"epoch": 0.9674947472500309,
"grad_norm": 0.10477799627917114,
"learning_rate": 1.728787344265308e-05,
"loss": 0.5011,
"step": 1957
},
{
"epoch": 0.9679891237177111,
"grad_norm": 0.1041438144299893,
"learning_rate": 1.728520658570917e-05,
"loss": 0.4804,
"step": 1958
},
{
"epoch": 0.9684835001853912,
"grad_norm": 0.09861380001501449,
"learning_rate": 1.7282538624153066e-05,
"loss": 0.4603,
"step": 1959
},
{
"epoch": 0.9689778766530713,
"grad_norm": 0.12023279548108626,
"learning_rate": 1.7279869558389295e-05,
"loss": 0.5209,
"step": 1960
},
{
"epoch": 0.9694722531207515,
"grad_norm": 0.11511124929974276,
"learning_rate": 1.7277199388822555e-05,
"loss": 0.4771,
"step": 1961
},
{
"epoch": 0.9699666295884316,
"grad_norm": 0.11211926524393473,
"learning_rate": 1.7274528115857707e-05,
"loss": 0.5068,
"step": 1962
},
{
"epoch": 0.9704610060561117,
"grad_norm": 0.11102841999461023,
"learning_rate": 1.727185573989978e-05,
"loss": 0.4602,
"step": 1963
},
{
"epoch": 0.9709553825237919,
"grad_norm": 0.1118241172889847,
"learning_rate": 1.726918226135397e-05,
"loss": 0.5076,
"step": 1964
},
{
"epoch": 0.971449758991472,
"grad_norm": 0.11702215806800176,
"learning_rate": 1.7266507680625638e-05,
"loss": 0.4896,
"step": 1965
},
{
"epoch": 0.9719441354591521,
"grad_norm": 0.11321557780049064,
"learning_rate": 1.726383199812032e-05,
"loss": 0.4935,
"step": 1966
},
{
"epoch": 0.9724385119268323,
"grad_norm": 0.10348936951510636,
"learning_rate": 1.7261155214243706e-05,
"loss": 0.5061,
"step": 1967
},
{
"epoch": 0.9729328883945124,
"grad_norm": 0.10965346281294179,
"learning_rate": 1.725847732940167e-05,
"loss": 0.4877,
"step": 1968
},
{
"epoch": 0.9734272648621926,
"grad_norm": 0.11442649831740459,
"learning_rate": 1.7255798344000235e-05,
"loss": 0.5099,
"step": 1969
},
{
"epoch": 0.9739216413298727,
"grad_norm": 0.10812298196697667,
"learning_rate": 1.725311825844561e-05,
"loss": 0.4853,
"step": 1970
},
{
"epoch": 0.9744160177975528,
"grad_norm": 0.11042316003903552,
"learning_rate": 1.7250437073144146e-05,
"loss": 0.4664,
"step": 1971
},
{
"epoch": 0.974910394265233,
"grad_norm": 0.11744216765088254,
"learning_rate": 1.724775478850239e-05,
"loss": 0.5115,
"step": 1972
},
{
"epoch": 0.9754047707329131,
"grad_norm": 0.11328501363133892,
"learning_rate": 1.724507140492703e-05,
"loss": 0.4911,
"step": 1973
},
{
"epoch": 0.9758991472005932,
"grad_norm": 0.11876973446950796,
"learning_rate": 1.7242386922824935e-05,
"loss": 0.4884,
"step": 1974
},
{
"epoch": 0.9763935236682734,
"grad_norm": 0.33511653986091194,
"learning_rate": 1.7239701342603136e-05,
"loss": 0.5427,
"step": 1975
},
{
"epoch": 0.9768879001359535,
"grad_norm": 0.12270597897230119,
"learning_rate": 1.7237014664668833e-05,
"loss": 0.539,
"step": 1976
},
{
"epoch": 0.9773822766036336,
"grad_norm": 0.1277280608837643,
"learning_rate": 1.7234326889429385e-05,
"loss": 0.5332,
"step": 1977
},
{
"epoch": 0.9778766530713138,
"grad_norm": 0.10748724877128706,
"learning_rate": 1.7231638017292337e-05,
"loss": 0.4766,
"step": 1978
},
{
"epoch": 0.9783710295389939,
"grad_norm": 0.11210248341976858,
"learning_rate": 1.7228948048665375e-05,
"loss": 0.4969,
"step": 1979
},
{
"epoch": 0.978865406006674,
"grad_norm": 0.10494198590786981,
"learning_rate": 1.7226256983956363e-05,
"loss": 0.4895,
"step": 1980
},
{
"epoch": 0.9793597824743542,
"grad_norm": 0.10844195781442616,
"learning_rate": 1.7223564823573337e-05,
"loss": 0.4951,
"step": 1981
},
{
"epoch": 0.9798541589420343,
"grad_norm": 0.10719666072326942,
"learning_rate": 1.7220871567924492e-05,
"loss": 0.4869,
"step": 1982
},
{
"epoch": 0.9803485354097144,
"grad_norm": 0.10364473788297957,
"learning_rate": 1.7218177217418183e-05,
"loss": 0.4992,
"step": 1983
},
{
"epoch": 0.9808429118773946,
"grad_norm": 0.11220376792453318,
"learning_rate": 1.7215481772462944e-05,
"loss": 0.5242,
"step": 1984
},
{
"epoch": 0.9813372883450747,
"grad_norm": 0.10505254502181574,
"learning_rate": 1.721278523346747e-05,
"loss": 0.4901,
"step": 1985
},
{
"epoch": 0.981831664812755,
"grad_norm": 0.11091387928755955,
"learning_rate": 1.721008760084062e-05,
"loss": 0.4729,
"step": 1986
},
{
"epoch": 0.9823260412804351,
"grad_norm": 0.11486155125119268,
"learning_rate": 1.720738887499142e-05,
"loss": 0.5157,
"step": 1987
},
{
"epoch": 0.9828204177481152,
"grad_norm": 0.11313543449359396,
"learning_rate": 1.7204689056329058e-05,
"loss": 0.4905,
"step": 1988
},
{
"epoch": 0.9833147942157954,
"grad_norm": 0.11238098079413154,
"learning_rate": 1.7201988145262897e-05,
"loss": 0.4944,
"step": 1989
},
{
"epoch": 0.9838091706834755,
"grad_norm": 0.11235639279987208,
"learning_rate": 1.7199286142202454e-05,
"loss": 0.5095,
"step": 1990
},
{
"epoch": 0.9843035471511556,
"grad_norm": 0.10545151807922566,
"learning_rate": 1.7196583047557425e-05,
"loss": 0.4985,
"step": 1991
},
{
"epoch": 0.9847979236188358,
"grad_norm": 0.10546865774976306,
"learning_rate": 1.7193878861737657e-05,
"loss": 0.4867,
"step": 1992
},
{
"epoch": 0.9852923000865159,
"grad_norm": 0.10714150743559547,
"learning_rate": 1.7191173585153174e-05,
"loss": 0.4751,
"step": 1993
},
{
"epoch": 0.985786676554196,
"grad_norm": 0.10578871800686891,
"learning_rate": 1.718846721821416e-05,
"loss": 0.5033,
"step": 1994
},
{
"epoch": 0.9862810530218762,
"grad_norm": 0.11075785366564267,
"learning_rate": 1.718575976133096e-05,
"loss": 0.4747,
"step": 1995
},
{
"epoch": 0.9867754294895563,
"grad_norm": 0.11160589571503844,
"learning_rate": 1.7183051214914096e-05,
"loss": 0.5469,
"step": 1996
},
{
"epoch": 0.9872698059572365,
"grad_norm": 0.11043148281740879,
"learning_rate": 1.7180341579374244e-05,
"loss": 0.4873,
"step": 1997
},
{
"epoch": 0.9877641824249166,
"grad_norm": 0.11227279723985613,
"learning_rate": 1.7177630855122256e-05,
"loss": 0.499,
"step": 1998
},
{
"epoch": 0.9882585588925967,
"grad_norm": 0.11533986536141681,
"learning_rate": 1.7174919042569137e-05,
"loss": 0.4869,
"step": 1999
},
{
"epoch": 0.9887529353602769,
"grad_norm": 0.11340180543149539,
"learning_rate": 1.7172206142126068e-05,
"loss": 0.5171,
"step": 2000
},
{
"epoch": 0.989247311827957,
"grad_norm": 0.10395991542057438,
"learning_rate": 1.7169492154204385e-05,
"loss": 0.4315,
"step": 2001
},
{
"epoch": 0.9897416882956371,
"grad_norm": 0.11910256853304534,
"learning_rate": 1.7166777079215595e-05,
"loss": 0.495,
"step": 2002
},
{
"epoch": 0.9902360647633173,
"grad_norm": 0.10520560837818187,
"learning_rate": 1.716406091757137e-05,
"loss": 0.5109,
"step": 2003
},
{
"epoch": 0.9907304412309974,
"grad_norm": 0.12217799570211313,
"learning_rate": 1.7161343669683542e-05,
"loss": 0.4772,
"step": 2004
},
{
"epoch": 0.9912248176986775,
"grad_norm": 0.10961634880155714,
"learning_rate": 1.7158625335964116e-05,
"loss": 0.5098,
"step": 2005
},
{
"epoch": 0.9917191941663577,
"grad_norm": 0.11052404957612626,
"learning_rate": 1.7155905916825253e-05,
"loss": 0.5047,
"step": 2006
},
{
"epoch": 0.9922135706340378,
"grad_norm": 0.5954535959854729,
"learning_rate": 1.7153185412679283e-05,
"loss": 0.5089,
"step": 2007
},
{
"epoch": 0.992707947101718,
"grad_norm": 0.1052661342861684,
"learning_rate": 1.7150463823938702e-05,
"loss": 0.4691,
"step": 2008
},
{
"epoch": 0.9932023235693981,
"grad_norm": 0.1139760671117412,
"learning_rate": 1.7147741151016163e-05,
"loss": 0.5042,
"step": 2009
},
{
"epoch": 0.9936967000370782,
"grad_norm": 0.11557418228188314,
"learning_rate": 1.7145017394324495e-05,
"loss": 0.5048,
"step": 2010
},
{
"epoch": 0.9941910765047584,
"grad_norm": 0.11160827216541694,
"learning_rate": 1.7142292554276678e-05,
"loss": 0.4833,
"step": 2011
},
{
"epoch": 0.9946854529724385,
"grad_norm": 0.11504189291899675,
"learning_rate": 1.7139566631285868e-05,
"loss": 0.4738,
"step": 2012
},
{
"epoch": 0.9951798294401186,
"grad_norm": 0.10206672622048257,
"learning_rate": 1.713683962576538e-05,
"loss": 0.4936,
"step": 2013
},
{
"epoch": 0.9956742059077988,
"grad_norm": 7.941408426196434,
"learning_rate": 1.7134111538128694e-05,
"loss": 0.6602,
"step": 2014
},
{
"epoch": 0.9961685823754789,
"grad_norm": 0.15765193504394298,
"learning_rate": 1.713138236878945e-05,
"loss": 0.4873,
"step": 2015
},
{
"epoch": 0.996662958843159,
"grad_norm": 0.14765850051408538,
"learning_rate": 1.7128652118161458e-05,
"loss": 0.508,
"step": 2016
},
{
"epoch": 0.9971573353108392,
"grad_norm": 0.16376422036115976,
"learning_rate": 1.7125920786658688e-05,
"loss": 0.5233,
"step": 2017
},
{
"epoch": 0.9976517117785193,
"grad_norm": 0.18733945733669605,
"learning_rate": 1.712318837469528e-05,
"loss": 0.4923,
"step": 2018
},
{
"epoch": 0.9981460882461994,
"grad_norm": 0.13955796070348173,
"learning_rate": 1.712045488268553e-05,
"loss": 0.4894,
"step": 2019
},
{
"epoch": 0.9986404647138796,
"grad_norm": 0.17043801378393225,
"learning_rate": 1.7117720311043897e-05,
"loss": 0.5137,
"step": 2020
},
{
"epoch": 0.9991348411815597,
"grad_norm": 0.12044735910668829,
"learning_rate": 1.7114984660185015e-05,
"loss": 0.4967,
"step": 2021
},
{
"epoch": 0.9996292176492398,
"grad_norm": 0.12453225261506602,
"learning_rate": 1.711224793052367e-05,
"loss": 0.5025,
"step": 2022
},
{
"epoch": 1.0,
"grad_norm": 0.12453225261506602,
"learning_rate": 1.7109510122474818e-05,
"loss": 0.4948,
"step": 2023
},
{
"epoch": 1.0004943764676801,
"grad_norm": 0.18153424362802134,
"learning_rate": 1.710677123645357e-05,
"loss": 0.3915,
"step": 2024
},
{
"epoch": 1.0004943764676801,
"eval_loss": 0.5047250390052795,
"eval_runtime": 101.4478,
"eval_samples_per_second": 299.208,
"eval_steps_per_second": 37.408,
"step": 2024
},
{
"epoch": 1.0009887529353603,
"grad_norm": 0.1752769228311676,
"learning_rate": 1.710403127287522e-05,
"loss": 0.4216,
"step": 2025
},
{
"epoch": 1.0014831294030404,
"grad_norm": 0.16825047412678462,
"learning_rate": 1.71012902321552e-05,
"loss": 0.418,
"step": 2026
},
{
"epoch": 1.0019775058707205,
"grad_norm": 0.1965153094793514,
"learning_rate": 1.7098548114709122e-05,
"loss": 0.385,
"step": 2027
},
{
"epoch": 1.0024718823384007,
"grad_norm": 0.14915785611172594,
"learning_rate": 1.7095804920952758e-05,
"loss": 0.4117,
"step": 2028
},
{
"epoch": 1.0029662588060808,
"grad_norm": 0.1856966017159681,
"learning_rate": 1.7093060651302042e-05,
"loss": 0.4198,
"step": 2029
},
{
"epoch": 1.003460635273761,
"grad_norm": 0.15262124341047278,
"learning_rate": 1.7090315306173068e-05,
"loss": 0.3851,
"step": 2030
},
{
"epoch": 1.003955011741441,
"grad_norm": 0.16416214215525346,
"learning_rate": 1.70875688859821e-05,
"loss": 0.4196,
"step": 2031
},
{
"epoch": 1.0044493882091212,
"grad_norm": 0.15428788035902716,
"learning_rate": 1.7084821391145558e-05,
"loss": 0.396,
"step": 2032
},
{
"epoch": 1.0049437646768014,
"grad_norm": 0.14266072757090145,
"learning_rate": 1.708207282208003e-05,
"loss": 0.4014,
"step": 2033
},
{
"epoch": 1.0054381411444815,
"grad_norm": 0.14963536124097934,
"learning_rate": 1.7079323179202262e-05,
"loss": 0.4148,
"step": 2034
},
{
"epoch": 1.0059325176121616,
"grad_norm": 0.13667979860077523,
"learning_rate": 1.7076572462929173e-05,
"loss": 0.4053,
"step": 2035
},
{
"epoch": 1.0064268940798418,
"grad_norm": 0.13346171121326075,
"learning_rate": 1.7073820673677833e-05,
"loss": 0.4123,
"step": 2036
},
{
"epoch": 1.006921270547522,
"grad_norm": 0.14677826547223075,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.3573,
"step": 2037
},
{
"epoch": 1.007415647015202,
"grad_norm": 0.12638839824705547,
"learning_rate": 1.7068313877909507e-05,
"loss": 0.4174,
"step": 2038
},
{
"epoch": 1.0079100234828822,
"grad_norm": 0.1268291400955907,
"learning_rate": 1.7065558872227488e-05,
"loss": 0.4032,
"step": 2039
},
{
"epoch": 1.0084043999505623,
"grad_norm": 0.12760030552588353,
"learning_rate": 1.706280279523714e-05,
"loss": 0.4299,
"step": 2040
},
{
"epoch": 1.0088987764182424,
"grad_norm": 0.11359028318405515,
"learning_rate": 1.7060045647356357e-05,
"loss": 0.3916,
"step": 2041
},
{
"epoch": 1.0093931528859226,
"grad_norm": 0.17574925438139927,
"learning_rate": 1.7057287429003185e-05,
"loss": 0.3854,
"step": 2042
},
{
"epoch": 1.0098875293536027,
"grad_norm": 0.10815869501996768,
"learning_rate": 1.7054528140595835e-05,
"loss": 0.3833,
"step": 2043
},
{
"epoch": 1.0103819058212828,
"grad_norm": 0.11665932211746784,
"learning_rate": 1.705176778255268e-05,
"loss": 0.4406,
"step": 2044
},
{
"epoch": 1.010876282288963,
"grad_norm": 0.13562056481424306,
"learning_rate": 1.704900635529226e-05,
"loss": 0.3873,
"step": 2045
},
{
"epoch": 1.0113706587566431,
"grad_norm": 0.11811688002158244,
"learning_rate": 1.7046243859233275e-05,
"loss": 0.4219,
"step": 2046
},
{
"epoch": 1.0118650352243233,
"grad_norm": 0.1314175254952622,
"learning_rate": 1.7043480294794583e-05,
"loss": 0.4032,
"step": 2047
},
{
"epoch": 1.0123594116920034,
"grad_norm": 0.10451809687173723,
"learning_rate": 1.7040715662395207e-05,
"loss": 0.408,
"step": 2048
},
{
"epoch": 1.0128537881596835,
"grad_norm": 0.12039986122640173,
"learning_rate": 1.703794996245433e-05,
"loss": 0.3995,
"step": 2049
},
{
"epoch": 1.0133481646273637,
"grad_norm": 0.11939850735201106,
"learning_rate": 1.7035183195391303e-05,
"loss": 0.4061,
"step": 2050
},
{
"epoch": 1.0138425410950438,
"grad_norm": 0.11438785851552596,
"learning_rate": 1.703241536162563e-05,
"loss": 0.3931,
"step": 2051
},
{
"epoch": 1.014336917562724,
"grad_norm": 0.11832903929068252,
"learning_rate": 1.7029646461576984e-05,
"loss": 0.405,
"step": 2052
},
{
"epoch": 1.014831294030404,
"grad_norm": 0.11542867636819672,
"learning_rate": 1.7026876495665194e-05,
"loss": 0.3771,
"step": 2053
},
{
"epoch": 1.0153256704980842,
"grad_norm": 0.12689059841113914,
"learning_rate": 1.7024105464310257e-05,
"loss": 0.3688,
"step": 2054
},
{
"epoch": 1.0158200469657643,
"grad_norm": 0.10974275104630789,
"learning_rate": 1.702133336793232e-05,
"loss": 0.4052,
"step": 2055
},
{
"epoch": 1.0163144234334445,
"grad_norm": 0.14853281046677955,
"learning_rate": 1.701856020695171e-05,
"loss": 0.4038,
"step": 2056
},
{
"epoch": 1.0168087999011246,
"grad_norm": 0.11787405388382236,
"learning_rate": 1.7015785981788902e-05,
"loss": 0.4119,
"step": 2057
},
{
"epoch": 1.0173031763688047,
"grad_norm": 0.12004911145466779,
"learning_rate": 1.7013010692864527e-05,
"loss": 0.4172,
"step": 2058
},
{
"epoch": 1.0177975528364849,
"grad_norm": 0.1325699359776669,
"learning_rate": 1.7010234340599393e-05,
"loss": 0.4041,
"step": 2059
},
{
"epoch": 1.0182919293041652,
"grad_norm": 0.11332249128259655,
"learning_rate": 1.700745692541446e-05,
"loss": 0.3872,
"step": 2060
},
{
"epoch": 1.0187863057718454,
"grad_norm": 0.11744735328741167,
"learning_rate": 1.700467844773085e-05,
"loss": 0.3946,
"step": 2061
},
{
"epoch": 1.0192806822395255,
"grad_norm": 0.10783253001785399,
"learning_rate": 1.700189890796985e-05,
"loss": 0.4331,
"step": 2062
},
{
"epoch": 1.0197750587072056,
"grad_norm": 0.1229652879765003,
"learning_rate": 1.6999118306552903e-05,
"loss": 0.3927,
"step": 2063
},
{
"epoch": 1.0202694351748858,
"grad_norm": 0.11578124244555453,
"learning_rate": 1.6996336643901613e-05,
"loss": 0.4209,
"step": 2064
},
{
"epoch": 1.020763811642566,
"grad_norm": 0.12599250607368692,
"learning_rate": 1.6993553920437745e-05,
"loss": 0.4566,
"step": 2065
},
{
"epoch": 1.021258188110246,
"grad_norm": 0.1271328001962951,
"learning_rate": 1.6990770136583234e-05,
"loss": 0.4167,
"step": 2066
},
{
"epoch": 1.0217525645779262,
"grad_norm": 0.1212883635205335,
"learning_rate": 1.6987985292760163e-05,
"loss": 0.408,
"step": 2067
},
{
"epoch": 1.0222469410456063,
"grad_norm": 0.11020780105865138,
"learning_rate": 1.6985199389390782e-05,
"loss": 0.384,
"step": 2068
},
{
"epoch": 1.0227413175132865,
"grad_norm": 0.1302510847215587,
"learning_rate": 1.6982412426897505e-05,
"loss": 0.4234,
"step": 2069
},
{
"epoch": 1.0232356939809666,
"grad_norm": 0.11382801320543301,
"learning_rate": 1.6979624405702895e-05,
"loss": 0.3886,
"step": 2070
},
{
"epoch": 1.0237300704486467,
"grad_norm": 0.13530971737003483,
"learning_rate": 1.697683532622969e-05,
"loss": 0.3899,
"step": 2071
},
{
"epoch": 1.0242244469163269,
"grad_norm": 0.12412237464326971,
"learning_rate": 1.6974045188900775e-05,
"loss": 0.3936,
"step": 2072
},
{
"epoch": 1.024718823384007,
"grad_norm": 0.10637871849533896,
"learning_rate": 1.6971253994139205e-05,
"loss": 0.4068,
"step": 2073
},
{
"epoch": 1.0252131998516871,
"grad_norm": 0.13091695201980175,
"learning_rate": 1.696846174236819e-05,
"loss": 0.4169,
"step": 2074
},
{
"epoch": 1.0257075763193673,
"grad_norm": 0.14751439805776972,
"learning_rate": 1.6965668434011112e-05,
"loss": 0.4001,
"step": 2075
},
{
"epoch": 1.0262019527870474,
"grad_norm": 0.11660745604774768,
"learning_rate": 1.696287406949149e-05,
"loss": 0.3792,
"step": 2076
},
{
"epoch": 1.0266963292547275,
"grad_norm": 0.11250216980437024,
"learning_rate": 1.6960078649233024e-05,
"loss": 0.3863,
"step": 2077
},
{
"epoch": 1.0271907057224077,
"grad_norm": 0.11003409309363488,
"learning_rate": 1.6957282173659567e-05,
"loss": 0.4097,
"step": 2078
},
{
"epoch": 1.0276850821900878,
"grad_norm": 0.11269296902525683,
"learning_rate": 1.695448464319513e-05,
"loss": 0.3982,
"step": 2079
},
{
"epoch": 1.028179458657768,
"grad_norm": 0.1111884975977521,
"learning_rate": 1.6951686058263884e-05,
"loss": 0.4124,
"step": 2080
},
{
"epoch": 1.028673835125448,
"grad_norm": 0.11338902926661874,
"learning_rate": 1.6948886419290167e-05,
"loss": 0.3854,
"step": 2081
},
{
"epoch": 1.0291682115931282,
"grad_norm": 0.10341342002625768,
"learning_rate": 1.6946085726698465e-05,
"loss": 0.3989,
"step": 2082
},
{
"epoch": 1.0296625880608083,
"grad_norm": 0.1148845574463567,
"learning_rate": 1.6943283980913438e-05,
"loss": 0.4099,
"step": 2083
},
{
"epoch": 1.0301569645284885,
"grad_norm": 0.14666095710708882,
"learning_rate": 1.694048118235989e-05,
"loss": 0.3742,
"step": 2084
},
{
"epoch": 1.0306513409961686,
"grad_norm": 0.10449883641320895,
"learning_rate": 1.6937677331462796e-05,
"loss": 0.3951,
"step": 2085
},
{
"epoch": 1.0311457174638488,
"grad_norm": 0.1135724825450342,
"learning_rate": 1.693487242864729e-05,
"loss": 0.38,
"step": 2086
},
{
"epoch": 1.031640093931529,
"grad_norm": 0.11102460521480473,
"learning_rate": 1.6932066474338658e-05,
"loss": 0.4082,
"step": 2087
},
{
"epoch": 1.032134470399209,
"grad_norm": 0.11086988431357646,
"learning_rate": 1.692925946896235e-05,
"loss": 0.418,
"step": 2088
},
{
"epoch": 1.0326288468668892,
"grad_norm": 0.11477116921168398,
"learning_rate": 1.6926451412943982e-05,
"loss": 0.4149,
"step": 2089
},
{
"epoch": 1.0331232233345693,
"grad_norm": 0.12991338299448496,
"learning_rate": 1.6923642306709312e-05,
"loss": 0.3989,
"step": 2090
},
{
"epoch": 1.0336175998022494,
"grad_norm": 0.11424185881521744,
"learning_rate": 1.6920832150684278e-05,
"loss": 0.3828,
"step": 2091
},
{
"epoch": 1.0341119762699296,
"grad_norm": 0.11874853776885116,
"learning_rate": 1.691802094529496e-05,
"loss": 0.4295,
"step": 2092
},
{
"epoch": 1.0346063527376097,
"grad_norm": 0.10954062409423658,
"learning_rate": 1.6915208690967607e-05,
"loss": 0.4283,
"step": 2093
},
{
"epoch": 1.0351007292052898,
"grad_norm": 0.11339910777003834,
"learning_rate": 1.6912395388128627e-05,
"loss": 0.4072,
"step": 2094
},
{
"epoch": 1.03559510567297,
"grad_norm": 0.1201630223609851,
"learning_rate": 1.690958103720458e-05,
"loss": 0.4153,
"step": 2095
},
{
"epoch": 1.03608948214065,
"grad_norm": 0.11914727828917349,
"learning_rate": 1.690676563862219e-05,
"loss": 0.402,
"step": 2096
},
{
"epoch": 1.0365838586083302,
"grad_norm": 0.11911604385767499,
"learning_rate": 1.690394919280834e-05,
"loss": 0.4748,
"step": 2097
},
{
"epoch": 1.0370782350760104,
"grad_norm": 0.11827352549989398,
"learning_rate": 1.6901131700190073e-05,
"loss": 0.4086,
"step": 2098
},
{
"epoch": 1.0375726115436905,
"grad_norm": 0.12083490576936849,
"learning_rate": 1.6898313161194584e-05,
"loss": 0.3894,
"step": 2099
},
{
"epoch": 1.0380669880113707,
"grad_norm": 0.11600285013630683,
"learning_rate": 1.6895493576249235e-05,
"loss": 0.413,
"step": 2100
},
{
"epoch": 1.0385613644790508,
"grad_norm": 0.12169988145248381,
"learning_rate": 1.6892672945781537e-05,
"loss": 0.4116,
"step": 2101
},
{
"epoch": 1.039055740946731,
"grad_norm": 0.12006418116755659,
"learning_rate": 1.6889851270219172e-05,
"loss": 0.4015,
"step": 2102
},
{
"epoch": 1.039550117414411,
"grad_norm": 0.11898989546864747,
"learning_rate": 1.688702854998997e-05,
"loss": 0.4208,
"step": 2103
},
{
"epoch": 1.0400444938820912,
"grad_norm": 0.13119342359454345,
"learning_rate": 1.6884204785521924e-05,
"loss": 0.397,
"step": 2104
},
{
"epoch": 1.0405388703497713,
"grad_norm": 0.12226183009022484,
"learning_rate": 1.688137997724319e-05,
"loss": 0.4003,
"step": 2105
},
{
"epoch": 1.0410332468174515,
"grad_norm": 0.11811475209066297,
"learning_rate": 1.6878554125582066e-05,
"loss": 0.4137,
"step": 2106
},
{
"epoch": 1.0415276232851316,
"grad_norm": 0.1169168824914463,
"learning_rate": 1.6875727230967025e-05,
"loss": 0.3822,
"step": 2107
},
{
"epoch": 1.0420219997528117,
"grad_norm": 0.1234110085657826,
"learning_rate": 1.687289929382669e-05,
"loss": 0.4266,
"step": 2108
},
{
"epoch": 1.0425163762204919,
"grad_norm": 0.11962726671781579,
"learning_rate": 1.6870070314589847e-05,
"loss": 0.4218,
"step": 2109
},
{
"epoch": 1.043010752688172,
"grad_norm": 0.12213079721862383,
"learning_rate": 1.6867240293685435e-05,
"loss": 0.3944,
"step": 2110
},
{
"epoch": 1.0435051291558521,
"grad_norm": 0.11244361597554325,
"learning_rate": 1.6864409231542557e-05,
"loss": 0.4077,
"step": 2111
},
{
"epoch": 1.0439995056235323,
"grad_norm": 0.10993343311364137,
"learning_rate": 1.6861577128590465e-05,
"loss": 0.3788,
"step": 2112
},
{
"epoch": 1.0444938820912124,
"grad_norm": 0.10923043873550811,
"learning_rate": 1.6858743985258573e-05,
"loss": 0.4007,
"step": 2113
},
{
"epoch": 1.0449882585588925,
"grad_norm": 0.11866797892679883,
"learning_rate": 1.685590980197646e-05,
"loss": 0.4167,
"step": 2114
},
{
"epoch": 1.0454826350265727,
"grad_norm": 0.12364578875490424,
"learning_rate": 1.685307457917385e-05,
"loss": 0.3957,
"step": 2115
},
{
"epoch": 1.0459770114942528,
"grad_norm": 0.10854843053852588,
"learning_rate": 1.6850238317280633e-05,
"loss": 0.4104,
"step": 2116
},
{
"epoch": 1.046471387961933,
"grad_norm": 0.12834643212755784,
"learning_rate": 1.6847401016726858e-05,
"loss": 0.3971,
"step": 2117
},
{
"epoch": 1.046965764429613,
"grad_norm": 0.11739371740900013,
"learning_rate": 1.684456267794272e-05,
"loss": 0.4426,
"step": 2118
},
{
"epoch": 1.0474601408972932,
"grad_norm": 0.1313221042885495,
"learning_rate": 1.6841723301358586e-05,
"loss": 0.4054,
"step": 2119
},
{
"epoch": 1.0479545173649734,
"grad_norm": 0.12414767711752121,
"learning_rate": 1.683888288740497e-05,
"loss": 0.3899,
"step": 2120
},
{
"epoch": 1.0484488938326535,
"grad_norm": 0.11447935040661719,
"learning_rate": 1.683604143651255e-05,
"loss": 0.4026,
"step": 2121
},
{
"epoch": 1.0489432703003336,
"grad_norm": 0.12334377584754935,
"learning_rate": 1.6833198949112155e-05,
"loss": 0.4113,
"step": 2122
},
{
"epoch": 1.0494376467680138,
"grad_norm": 0.11395667179990795,
"learning_rate": 1.6830355425634775e-05,
"loss": 0.3717,
"step": 2123
},
{
"epoch": 1.049932023235694,
"grad_norm": 0.12024044971722216,
"learning_rate": 1.682751086651156e-05,
"loss": 0.4101,
"step": 2124
},
{
"epoch": 1.050426399703374,
"grad_norm": 0.11476488071097693,
"learning_rate": 1.6824665272173806e-05,
"loss": 0.4129,
"step": 2125
},
{
"epoch": 1.0509207761710542,
"grad_norm": 0.12380805998555132,
"learning_rate": 1.682181864305298e-05,
"loss": 0.4243,
"step": 2126
},
{
"epoch": 1.0514151526387343,
"grad_norm": 0.12034650698045578,
"learning_rate": 1.68189709795807e-05,
"loss": 0.4146,
"step": 2127
},
{
"epoch": 1.0519095291064144,
"grad_norm": 0.11134990221931472,
"learning_rate": 1.6816122282188735e-05,
"loss": 0.4023,
"step": 2128
},
{
"epoch": 1.0524039055740946,
"grad_norm": 0.1270041728341491,
"learning_rate": 1.681327255130902e-05,
"loss": 0.4057,
"step": 2129
},
{
"epoch": 1.0528982820417747,
"grad_norm": 0.12055909373004821,
"learning_rate": 1.6810421787373644e-05,
"loss": 0.4076,
"step": 2130
},
{
"epoch": 1.0533926585094548,
"grad_norm": 0.11683084829710798,
"learning_rate": 1.6807569990814842e-05,
"loss": 0.4078,
"step": 2131
},
{
"epoch": 1.053887034977135,
"grad_norm": 0.10608619363958134,
"learning_rate": 1.680471716206502e-05,
"loss": 0.3787,
"step": 2132
},
{
"epoch": 1.0543814114448151,
"grad_norm": 0.10400878286680257,
"learning_rate": 1.680186330155674e-05,
"loss": 0.4092,
"step": 2133
},
{
"epoch": 1.0548757879124953,
"grad_norm": 0.11109097946667915,
"learning_rate": 1.6799008409722713e-05,
"loss": 0.4039,
"step": 2134
},
{
"epoch": 1.0553701643801756,
"grad_norm": 0.11546250039758871,
"learning_rate": 1.679615248699581e-05,
"loss": 0.4103,
"step": 2135
},
{
"epoch": 1.0558645408478557,
"grad_norm": 0.11353171521048112,
"learning_rate": 1.6793295533809054e-05,
"loss": 0.409,
"step": 2136
},
{
"epoch": 1.0563589173155359,
"grad_norm": 0.11375729750499433,
"learning_rate": 1.6790437550595626e-05,
"loss": 0.3991,
"step": 2137
},
{
"epoch": 1.056853293783216,
"grad_norm": 0.11108320014305491,
"learning_rate": 1.678757853778887e-05,
"loss": 0.4274,
"step": 2138
},
{
"epoch": 1.0573476702508962,
"grad_norm": 0.11870551399325992,
"learning_rate": 1.678471849582228e-05,
"loss": 0.4172,
"step": 2139
},
{
"epoch": 1.0578420467185763,
"grad_norm": 0.11702126306498127,
"learning_rate": 1.6781857425129507e-05,
"loss": 0.3961,
"step": 2140
},
{
"epoch": 1.0583364231862564,
"grad_norm": 0.10632551924318828,
"learning_rate": 1.6778995326144354e-05,
"loss": 0.3979,
"step": 2141
},
{
"epoch": 1.0588307996539366,
"grad_norm": 0.1165240021633167,
"learning_rate": 1.6776132199300792e-05,
"loss": 0.4077,
"step": 2142
},
{
"epoch": 1.0593251761216167,
"grad_norm": 0.11198346954929352,
"learning_rate": 1.677326804503293e-05,
"loss": 0.4091,
"step": 2143
},
{
"epoch": 1.0598195525892968,
"grad_norm": 0.11073549927738564,
"learning_rate": 1.677040286377505e-05,
"loss": 0.412,
"step": 2144
},
{
"epoch": 1.060313929056977,
"grad_norm": 0.1204491999143969,
"learning_rate": 1.676753665596158e-05,
"loss": 0.4143,
"step": 2145
},
{
"epoch": 1.060808305524657,
"grad_norm": 0.11773244841133315,
"learning_rate": 1.6764669422027106e-05,
"loss": 0.3901,
"step": 2146
},
{
"epoch": 1.0613026819923372,
"grad_norm": 0.1136051267158179,
"learning_rate": 1.676180116240637e-05,
"loss": 0.4245,
"step": 2147
},
{
"epoch": 1.0617970584600174,
"grad_norm": 0.11683870623046365,
"learning_rate": 1.6758931877534263e-05,
"loss": 0.4137,
"step": 2148
},
{
"epoch": 1.0622914349276975,
"grad_norm": 0.10621653612934304,
"learning_rate": 1.6756061567845843e-05,
"loss": 0.3773,
"step": 2149
},
{
"epoch": 1.0627858113953776,
"grad_norm": 0.11503299493356486,
"learning_rate": 1.6753190233776323e-05,
"loss": 0.4002,
"step": 2150
},
{
"epoch": 1.0632801878630578,
"grad_norm": 0.11122741092297105,
"learning_rate": 1.6750317875761057e-05,
"loss": 0.4054,
"step": 2151
},
{
"epoch": 1.063774564330738,
"grad_norm": 0.11822984949764032,
"learning_rate": 1.6747444494235565e-05,
"loss": 0.3935,
"step": 2152
},
{
"epoch": 1.064268940798418,
"grad_norm": 0.10470970866030703,
"learning_rate": 1.6744570089635527e-05,
"loss": 0.3998,
"step": 2153
},
{
"epoch": 1.0647633172660982,
"grad_norm": 0.11315178878082581,
"learning_rate": 1.6741694662396763e-05,
"loss": 0.3951,
"step": 2154
},
{
"epoch": 1.0652576937337783,
"grad_norm": 0.10585113697846772,
"learning_rate": 1.6738818212955263e-05,
"loss": 0.4054,
"step": 2155
},
{
"epoch": 1.0657520702014585,
"grad_norm": 0.11100832508175573,
"learning_rate": 1.673594074174716e-05,
"loss": 0.4111,
"step": 2156
},
{
"epoch": 1.0662464466691386,
"grad_norm": 0.11634650043849763,
"learning_rate": 1.673306224920876e-05,
"loss": 0.3892,
"step": 2157
},
{
"epoch": 1.0667408231368187,
"grad_norm": 0.11557100714294805,
"learning_rate": 1.67301827357765e-05,
"loss": 0.385,
"step": 2158
},
{
"epoch": 1.0672351996044989,
"grad_norm": 0.1204857852123548,
"learning_rate": 1.6727302201886986e-05,
"loss": 0.4142,
"step": 2159
},
{
"epoch": 1.067729576072179,
"grad_norm": 0.11147177191487864,
"learning_rate": 1.6724420647976972e-05,
"loss": 0.3996,
"step": 2160
},
{
"epoch": 1.0682239525398591,
"grad_norm": 0.11207876185378114,
"learning_rate": 1.6721538074483385e-05,
"loss": 0.3871,
"step": 2161
},
{
"epoch": 1.0687183290075393,
"grad_norm": 0.10859895582760695,
"learning_rate": 1.671865448184327e-05,
"loss": 0.4096,
"step": 2162
},
{
"epoch": 1.0692127054752194,
"grad_norm": 0.10929215236868707,
"learning_rate": 1.671576987049387e-05,
"loss": 0.3899,
"step": 2163
},
{
"epoch": 1.0697070819428995,
"grad_norm": 0.11481653608179412,
"learning_rate": 1.6712884240872555e-05,
"loss": 0.3881,
"step": 2164
},
{
"epoch": 1.0702014584105797,
"grad_norm": 0.1258275450096322,
"learning_rate": 1.670999759341685e-05,
"loss": 0.3916,
"step": 2165
},
{
"epoch": 1.0706958348782598,
"grad_norm": 0.10911497980432523,
"learning_rate": 1.670710992856444e-05,
"loss": 0.3876,
"step": 2166
},
{
"epoch": 1.07119021134594,
"grad_norm": 0.12370498606312262,
"learning_rate": 1.6704221246753172e-05,
"loss": 0.3919,
"step": 2167
},
{
"epoch": 1.07168458781362,
"grad_norm": 0.1174136243329756,
"learning_rate": 1.6701331548421037e-05,
"loss": 0.4217,
"step": 2168
},
{
"epoch": 1.0721789642813002,
"grad_norm": 0.11218691228289837,
"learning_rate": 1.669844083400618e-05,
"loss": 0.4125,
"step": 2169
},
{
"epoch": 1.0726733407489804,
"grad_norm": 0.10913815366814034,
"learning_rate": 1.66955491039469e-05,
"loss": 0.3843,
"step": 2170
},
{
"epoch": 1.0731677172166605,
"grad_norm": 0.11127156262058704,
"learning_rate": 1.669265635868166e-05,
"loss": 0.4381,
"step": 2171
},
{
"epoch": 1.0736620936843406,
"grad_norm": 0.14028299496136484,
"learning_rate": 1.6689762598649063e-05,
"loss": 0.4075,
"step": 2172
},
{
"epoch": 1.0741564701520208,
"grad_norm": 0.11229280606603768,
"learning_rate": 1.6686867824287877e-05,
"loss": 0.416,
"step": 2173
},
{
"epoch": 1.074650846619701,
"grad_norm": 0.11911154032736639,
"learning_rate": 1.668397203603702e-05,
"loss": 0.4067,
"step": 2174
},
{
"epoch": 1.075145223087381,
"grad_norm": 0.11483868025404466,
"learning_rate": 1.668107523433556e-05,
"loss": 0.4148,
"step": 2175
},
{
"epoch": 1.0756395995550612,
"grad_norm": 0.11030321202521919,
"learning_rate": 1.667817741962272e-05,
"loss": 0.4168,
"step": 2176
},
{
"epoch": 1.0761339760227413,
"grad_norm": 0.11357312031756996,
"learning_rate": 1.6675278592337885e-05,
"loss": 0.4244,
"step": 2177
},
{
"epoch": 1.0766283524904214,
"grad_norm": 0.10921996573887986,
"learning_rate": 1.6672378752920576e-05,
"loss": 0.4276,
"step": 2178
},
{
"epoch": 1.0771227289581016,
"grad_norm": 0.1188492925365669,
"learning_rate": 1.666947790181049e-05,
"loss": 0.4006,
"step": 2179
},
{
"epoch": 1.0776171054257817,
"grad_norm": 0.11569774250611278,
"learning_rate": 1.666657603944746e-05,
"loss": 0.4141,
"step": 2180
},
{
"epoch": 1.0781114818934618,
"grad_norm": 0.11037118560457505,
"learning_rate": 1.666367316627148e-05,
"loss": 0.3774,
"step": 2181
},
{
"epoch": 1.078605858361142,
"grad_norm": 0.11882932149596917,
"learning_rate": 1.6660769282722688e-05,
"loss": 0.4031,
"step": 2182
},
{
"epoch": 1.0791002348288221,
"grad_norm": 0.1058184727713352,
"learning_rate": 1.6657864389241397e-05,
"loss": 0.4164,
"step": 2183
},
{
"epoch": 1.0795946112965022,
"grad_norm": 0.1079788968503062,
"learning_rate": 1.665495848626804e-05,
"loss": 0.4033,
"step": 2184
},
{
"epoch": 1.0800889877641824,
"grad_norm": 0.11271508186931825,
"learning_rate": 1.6652051574243237e-05,
"loss": 0.3985,
"step": 2185
},
{
"epoch": 1.0805833642318625,
"grad_norm": 0.10260276546131321,
"learning_rate": 1.6649143653607736e-05,
"loss": 0.4271,
"step": 2186
},
{
"epoch": 1.0810777406995427,
"grad_norm": 0.13296850829502121,
"learning_rate": 1.664623472480246e-05,
"loss": 0.3918,
"step": 2187
},
{
"epoch": 1.0815721171672228,
"grad_norm": 0.11976589627743114,
"learning_rate": 1.6643324788268457e-05,
"loss": 0.4211,
"step": 2188
},
{
"epoch": 1.082066493634903,
"grad_norm": 0.1143749911207797,
"learning_rate": 1.664041384444695e-05,
"loss": 0.4129,
"step": 2189
},
{
"epoch": 1.082560870102583,
"grad_norm": 0.11937458597449709,
"learning_rate": 1.663750189377931e-05,
"loss": 0.4046,
"step": 2190
},
{
"epoch": 1.0830552465702632,
"grad_norm": 0.12292797023170124,
"learning_rate": 1.663458893670706e-05,
"loss": 0.4135,
"step": 2191
},
{
"epoch": 1.0835496230379433,
"grad_norm": 0.11187765796020867,
"learning_rate": 1.663167497367187e-05,
"loss": 0.4227,
"step": 2192
},
{
"epoch": 1.0840439995056235,
"grad_norm": 0.114981261773038,
"learning_rate": 1.6628760005115564e-05,
"loss": 0.3998,
"step": 2193
},
{
"epoch": 1.0845383759733036,
"grad_norm": 0.11224497429227054,
"learning_rate": 1.6625844031480128e-05,
"loss": 0.4005,
"step": 2194
},
{
"epoch": 1.0850327524409837,
"grad_norm": 0.11904830385129021,
"learning_rate": 1.6622927053207686e-05,
"loss": 0.4255,
"step": 2195
},
{
"epoch": 1.0855271289086639,
"grad_norm": 0.1063153871990056,
"learning_rate": 1.6620009070740534e-05,
"loss": 0.4204,
"step": 2196
},
{
"epoch": 1.086021505376344,
"grad_norm": 0.11496033729761301,
"learning_rate": 1.6617090084521094e-05,
"loss": 0.3961,
"step": 2197
},
{
"epoch": 1.0865158818440241,
"grad_norm": 0.11680444461223759,
"learning_rate": 1.6614170094991962e-05,
"loss": 0.3829,
"step": 2198
},
{
"epoch": 1.0870102583117043,
"grad_norm": 0.10157883187635595,
"learning_rate": 1.661124910259588e-05,
"loss": 0.3781,
"step": 2199
},
{
"epoch": 1.0875046347793844,
"grad_norm": 0.11579066944807076,
"learning_rate": 1.660832710777574e-05,
"loss": 0.3877,
"step": 2200
},
{
"epoch": 1.0879990112470646,
"grad_norm": 0.11161258705720567,
"learning_rate": 1.660540411097458e-05,
"loss": 0.3964,
"step": 2201
},
{
"epoch": 1.0884933877147447,
"grad_norm": 0.1152660069497564,
"learning_rate": 1.6602480112635606e-05,
"loss": 0.3939,
"step": 2202
},
{
"epoch": 1.0889877641824248,
"grad_norm": 0.11174715540219592,
"learning_rate": 1.659955511320216e-05,
"loss": 0.3868,
"step": 2203
},
{
"epoch": 1.089482140650105,
"grad_norm": 0.11321230356061432,
"learning_rate": 1.6596629113117742e-05,
"loss": 0.4046,
"step": 2204
},
{
"epoch": 1.089976517117785,
"grad_norm": 0.12512097713564796,
"learning_rate": 1.6593702112826004e-05,
"loss": 0.4001,
"step": 2205
},
{
"epoch": 1.0904708935854652,
"grad_norm": 0.10497549073264655,
"learning_rate": 1.6590774112770754e-05,
"loss": 0.3926,
"step": 2206
},
{
"epoch": 1.0909652700531454,
"grad_norm": 0.12297916884705463,
"learning_rate": 1.6587845113395943e-05,
"loss": 0.3901,
"step": 2207
},
{
"epoch": 1.0914596465208257,
"grad_norm": 0.11077527828506713,
"learning_rate": 1.6584915115145678e-05,
"loss": 0.425,
"step": 2208
},
{
"epoch": 1.0919540229885056,
"grad_norm": 0.11296480996830785,
"learning_rate": 1.658198411846422e-05,
"loss": 0.394,
"step": 2209
},
{
"epoch": 1.092448399456186,
"grad_norm": 0.10947202231524285,
"learning_rate": 1.6579052123795977e-05,
"loss": 0.4257,
"step": 2210
},
{
"epoch": 1.0929427759238661,
"grad_norm": 0.12184109194659867,
"learning_rate": 1.6576119131585505e-05,
"loss": 0.4035,
"step": 2211
},
{
"epoch": 1.0934371523915463,
"grad_norm": 0.11317615416518913,
"learning_rate": 1.6573185142277525e-05,
"loss": 0.3921,
"step": 2212
},
{
"epoch": 1.0939315288592264,
"grad_norm": 0.12117421028974514,
"learning_rate": 1.657025015631689e-05,
"loss": 0.3907,
"step": 2213
},
{
"epoch": 1.0944259053269065,
"grad_norm": 0.10982990281705682,
"learning_rate": 1.656731417414862e-05,
"loss": 0.3811,
"step": 2214
},
{
"epoch": 1.0949202817945867,
"grad_norm": 0.11498136790970918,
"learning_rate": 1.6564377196217883e-05,
"loss": 0.4199,
"step": 2215
},
{
"epoch": 1.0954146582622668,
"grad_norm": 0.10980739362654905,
"learning_rate": 1.6561439222969992e-05,
"loss": 0.3965,
"step": 2216
},
{
"epoch": 1.095909034729947,
"grad_norm": 0.10800635066994689,
"learning_rate": 1.6558500254850412e-05,
"loss": 0.4072,
"step": 2217
},
{
"epoch": 1.096403411197627,
"grad_norm": 0.11767123349576289,
"learning_rate": 1.6555560292304767e-05,
"loss": 0.4088,
"step": 2218
},
{
"epoch": 1.0968977876653072,
"grad_norm": 0.1338248931979977,
"learning_rate": 1.6552619335778822e-05,
"loss": 0.4032,
"step": 2219
},
{
"epoch": 1.0973921641329873,
"grad_norm": 0.11184482940752391,
"learning_rate": 1.6549677385718498e-05,
"loss": 0.3998,
"step": 2220
},
{
"epoch": 1.0978865406006675,
"grad_norm": 0.11998220733774721,
"learning_rate": 1.654673444256986e-05,
"loss": 0.3975,
"step": 2221
},
{
"epoch": 1.0983809170683476,
"grad_norm": 0.1107141913216501,
"learning_rate": 1.6543790506779136e-05,
"loss": 0.4292,
"step": 2222
},
{
"epoch": 1.0988752935360278,
"grad_norm": 0.11579837170330415,
"learning_rate": 1.6540845578792692e-05,
"loss": 0.3979,
"step": 2223
},
{
"epoch": 1.0993696700037079,
"grad_norm": 0.1180545537637998,
"learning_rate": 1.6537899659057058e-05,
"loss": 0.4117,
"step": 2224
},
{
"epoch": 1.099864046471388,
"grad_norm": 0.1064858265890263,
"learning_rate": 1.6534952748018894e-05,
"loss": 0.4137,
"step": 2225
},
{
"epoch": 1.1003584229390682,
"grad_norm": 0.13291749305498185,
"learning_rate": 1.6532004846125034e-05,
"loss": 0.4065,
"step": 2226
},
{
"epoch": 1.1008527994067483,
"grad_norm": 0.11587130633406197,
"learning_rate": 1.652905595382244e-05,
"loss": 0.3852,
"step": 2227
},
{
"epoch": 1.1013471758744284,
"grad_norm": 0.1197762927342482,
"learning_rate": 1.652610607155825e-05,
"loss": 0.3901,
"step": 2228
},
{
"epoch": 1.1018415523421086,
"grad_norm": 0.11632708426627318,
"learning_rate": 1.6523155199779722e-05,
"loss": 0.4074,
"step": 2229
},
{
"epoch": 1.1023359288097887,
"grad_norm": 0.12497136363302573,
"learning_rate": 1.652020333893428e-05,
"loss": 0.4438,
"step": 2230
},
{
"epoch": 1.1028303052774688,
"grad_norm": 0.12340386843469942,
"learning_rate": 1.6517250489469507e-05,
"loss": 0.4254,
"step": 2231
},
{
"epoch": 1.103324681745149,
"grad_norm": 0.10911012157859583,
"learning_rate": 1.6514296651833118e-05,
"loss": 0.3993,
"step": 2232
},
{
"epoch": 1.103819058212829,
"grad_norm": 0.1137737389675077,
"learning_rate": 1.651134182647299e-05,
"loss": 0.3789,
"step": 2233
},
{
"epoch": 1.1043134346805092,
"grad_norm": 0.11018809134032267,
"learning_rate": 1.650838601383714e-05,
"loss": 0.379,
"step": 2234
},
{
"epoch": 1.1048078111481894,
"grad_norm": 0.10996933529505634,
"learning_rate": 1.6505429214373748e-05,
"loss": 0.3943,
"step": 2235
},
{
"epoch": 1.1053021876158695,
"grad_norm": 0.11880045460921522,
"learning_rate": 1.650247142853113e-05,
"loss": 0.4244,
"step": 2236
},
{
"epoch": 1.1057965640835496,
"grad_norm": 0.11386153088735446,
"learning_rate": 1.6499512656757756e-05,
"loss": 0.3925,
"step": 2237
},
{
"epoch": 1.1062909405512298,
"grad_norm": 0.11261577953843618,
"learning_rate": 1.649655289950225e-05,
"loss": 0.4299,
"step": 2238
},
{
"epoch": 1.10678531701891,
"grad_norm": 0.11863218866861588,
"learning_rate": 1.6493592157213383e-05,
"loss": 0.4226,
"step": 2239
},
{
"epoch": 1.10727969348659,
"grad_norm": 0.27179957523989406,
"learning_rate": 1.6490630430340072e-05,
"loss": 0.43,
"step": 2240
},
{
"epoch": 1.1077740699542702,
"grad_norm": 0.1199435735148248,
"learning_rate": 1.648766771933139e-05,
"loss": 0.3807,
"step": 2241
},
{
"epoch": 1.1082684464219503,
"grad_norm": 0.12158640572884882,
"learning_rate": 1.6484704024636552e-05,
"loss": 0.4518,
"step": 2242
},
{
"epoch": 1.1087628228896305,
"grad_norm": 0.10928110388373147,
"learning_rate": 1.6481739346704922e-05,
"loss": 0.3797,
"step": 2243
},
{
"epoch": 1.1092571993573106,
"grad_norm": 0.11675732365810937,
"learning_rate": 1.6478773685986022e-05,
"loss": 0.4098,
"step": 2244
},
{
"epoch": 1.1097515758249907,
"grad_norm": 0.12220792112018723,
"learning_rate": 1.6475807042929515e-05,
"loss": 0.3994,
"step": 2245
},
{
"epoch": 1.1102459522926709,
"grad_norm": 0.11243538774293758,
"learning_rate": 1.6472839417985216e-05,
"loss": 0.396,
"step": 2246
},
{
"epoch": 1.110740328760351,
"grad_norm": 0.11466996522674733,
"learning_rate": 1.6469870811603085e-05,
"loss": 0.3949,
"step": 2247
},
{
"epoch": 1.1112347052280311,
"grad_norm": 0.11707208716327738,
"learning_rate": 1.646690122423324e-05,
"loss": 0.3993,
"step": 2248
},
{
"epoch": 1.1117290816957113,
"grad_norm": 0.11357760102085576,
"learning_rate": 1.6463930656325938e-05,
"loss": 0.396,
"step": 2249
},
{
"epoch": 1.1122234581633914,
"grad_norm": 0.11133293042181042,
"learning_rate": 1.6460959108331592e-05,
"loss": 0.4182,
"step": 2250
},
{
"epoch": 1.1127178346310715,
"grad_norm": 0.11642842699947192,
"learning_rate": 1.6457986580700753e-05,
"loss": 0.4476,
"step": 2251
},
{
"epoch": 1.1132122110987517,
"grad_norm": 0.990652809057328,
"learning_rate": 1.645501307388413e-05,
"loss": 0.4077,
"step": 2252
},
{
"epoch": 1.1137065875664318,
"grad_norm": 0.13239413183255844,
"learning_rate": 1.6452038588332583e-05,
"loss": 0.4162,
"step": 2253
},
{
"epoch": 1.114200964034112,
"grad_norm": 0.13626236381986162,
"learning_rate": 1.644906312449711e-05,
"loss": 0.4264,
"step": 2254
},
{
"epoch": 1.114695340501792,
"grad_norm": 0.13000081916567144,
"learning_rate": 1.6446086682828865e-05,
"loss": 0.3698,
"step": 2255
},
{
"epoch": 1.1151897169694722,
"grad_norm": 0.15265896555092573,
"learning_rate": 1.6443109263779145e-05,
"loss": 0.4006,
"step": 2256
},
{
"epoch": 1.1156840934371524,
"grad_norm": 0.13795578814489654,
"learning_rate": 1.6440130867799404e-05,
"loss": 0.4055,
"step": 2257
},
{
"epoch": 1.1161784699048325,
"grad_norm": 0.13236467493317516,
"learning_rate": 1.6437151495341234e-05,
"loss": 0.3923,
"step": 2258
},
{
"epoch": 1.1166728463725126,
"grad_norm": 0.1452849722676653,
"learning_rate": 1.643417114685638e-05,
"loss": 0.4017,
"step": 2259
},
{
"epoch": 1.1171672228401928,
"grad_norm": 0.1352305976308632,
"learning_rate": 1.6431189822796732e-05,
"loss": 0.3877,
"step": 2260
},
{
"epoch": 1.117661599307873,
"grad_norm": 0.12391851669635717,
"learning_rate": 1.6428207523614337e-05,
"loss": 0.4106,
"step": 2261
},
{
"epoch": 1.118155975775553,
"grad_norm": 0.13775021296888315,
"learning_rate": 1.642522424976138e-05,
"loss": 0.4003,
"step": 2262
},
{
"epoch": 1.1186503522432332,
"grad_norm": 0.12393395448424763,
"learning_rate": 1.6422240001690193e-05,
"loss": 0.4153,
"step": 2263
},
{
"epoch": 1.1191447287109133,
"grad_norm": 0.1218138213814887,
"learning_rate": 1.6419254779853268e-05,
"loss": 0.4296,
"step": 2264
},
{
"epoch": 1.1196391051785934,
"grad_norm": 0.12737476845635112,
"learning_rate": 1.6416268584703225e-05,
"loss": 0.4009,
"step": 2265
},
{
"epoch": 1.1201334816462736,
"grad_norm": 0.11078838187161161,
"learning_rate": 1.6413281416692853e-05,
"loss": 0.3999,
"step": 2266
},
{
"epoch": 1.1206278581139537,
"grad_norm": 0.12584023652526918,
"learning_rate": 1.641029327627507e-05,
"loss": 0.4235,
"step": 2267
},
{
"epoch": 1.1211222345816338,
"grad_norm": 0.11251492553225174,
"learning_rate": 1.6407304163902958e-05,
"loss": 0.4329,
"step": 2268
},
{
"epoch": 1.121616611049314,
"grad_norm": 0.11642908619826607,
"learning_rate": 1.6404314080029736e-05,
"loss": 0.3792,
"step": 2269
},
{
"epoch": 1.1221109875169941,
"grad_norm": 0.11527228533483551,
"learning_rate": 1.640132302510877e-05,
"loss": 0.4056,
"step": 2270
},
{
"epoch": 1.1226053639846743,
"grad_norm": 0.12529904982710482,
"learning_rate": 1.6398330999593573e-05,
"loss": 0.3969,
"step": 2271
},
{
"epoch": 1.1230997404523544,
"grad_norm": 0.12036422526602866,
"learning_rate": 1.639533800393781e-05,
"loss": 0.4078,
"step": 2272
},
{
"epoch": 1.1235941169200345,
"grad_norm": 0.13409644380554386,
"learning_rate": 1.63923440385953e-05,
"loss": 0.4189,
"step": 2273
},
{
"epoch": 1.1240884933877147,
"grad_norm": 0.11333779111993568,
"learning_rate": 1.6389349104019986e-05,
"loss": 0.3874,
"step": 2274
},
{
"epoch": 1.1245828698553948,
"grad_norm": 0.1195640921849143,
"learning_rate": 1.6386353200665982e-05,
"loss": 0.4032,
"step": 2275
},
{
"epoch": 1.125077246323075,
"grad_norm": 0.11556976247637107,
"learning_rate": 1.6383356328987535e-05,
"loss": 0.4094,
"step": 2276
},
{
"epoch": 1.125571622790755,
"grad_norm": 0.11374498305164688,
"learning_rate": 1.638035848943904e-05,
"loss": 0.4126,
"step": 2277
},
{
"epoch": 1.1260659992584352,
"grad_norm": 0.11146087105504611,
"learning_rate": 1.6377359682475047e-05,
"loss": 0.4472,
"step": 2278
},
{
"epoch": 1.1265603757261156,
"grad_norm": 0.11884683440916935,
"learning_rate": 1.6374359908550245e-05,
"loss": 0.4126,
"step": 2279
},
{
"epoch": 1.1270547521937955,
"grad_norm": 0.11557593015424181,
"learning_rate": 1.6371359168119467e-05,
"loss": 0.4054,
"step": 2280
},
{
"epoch": 1.1275491286614758,
"grad_norm": 0.1164825755484847,
"learning_rate": 1.6368357461637706e-05,
"loss": 0.415,
"step": 2281
},
{
"epoch": 1.1280435051291557,
"grad_norm": 0.12017913256254907,
"learning_rate": 1.6365354789560086e-05,
"loss": 0.4138,
"step": 2282
},
{
"epoch": 1.128537881596836,
"grad_norm": 0.11069532647451664,
"learning_rate": 1.6362351152341888e-05,
"loss": 0.3916,
"step": 2283
},
{
"epoch": 1.129032258064516,
"grad_norm": 0.10511571353233921,
"learning_rate": 1.6359346550438533e-05,
"loss": 0.3886,
"step": 2284
},
{
"epoch": 1.1295266345321964,
"grad_norm": 0.488738661411721,
"learning_rate": 1.635634098430559e-05,
"loss": 0.434,
"step": 2285
},
{
"epoch": 1.1300210109998763,
"grad_norm": 0.3390303157097796,
"learning_rate": 1.635333445439878e-05,
"loss": 0.4067,
"step": 2286
},
{
"epoch": 1.1305153874675566,
"grad_norm": 0.11695262300412133,
"learning_rate": 1.635032696117396e-05,
"loss": 0.4248,
"step": 2287
},
{
"epoch": 1.1310097639352368,
"grad_norm": 0.11896119234416795,
"learning_rate": 1.6347318505087143e-05,
"loss": 0.3993,
"step": 2288
},
{
"epoch": 1.131504140402917,
"grad_norm": 0.11241133496991543,
"learning_rate": 1.634430908659448e-05,
"loss": 0.407,
"step": 2289
},
{
"epoch": 1.131998516870597,
"grad_norm": 0.11797558211831256,
"learning_rate": 1.6341298706152266e-05,
"loss": 0.3881,
"step": 2290
},
{
"epoch": 1.1324928933382772,
"grad_norm": 0.11283055868463943,
"learning_rate": 1.6338287364216954e-05,
"loss": 0.3939,
"step": 2291
},
{
"epoch": 1.1329872698059573,
"grad_norm": 0.12449217127908292,
"learning_rate": 1.6335275061245135e-05,
"loss": 0.4023,
"step": 2292
},
{
"epoch": 1.1334816462736375,
"grad_norm": 0.12559564138632157,
"learning_rate": 1.6332261797693545e-05,
"loss": 0.4309,
"step": 2293
},
{
"epoch": 1.1339760227413176,
"grad_norm": 0.1242993643725759,
"learning_rate": 1.6329247574019068e-05,
"loss": 0.4268,
"step": 2294
},
{
"epoch": 1.1344703992089977,
"grad_norm": 0.12335243596709858,
"learning_rate": 1.632623239067873e-05,
"loss": 0.4067,
"step": 2295
},
{
"epoch": 1.1349647756766779,
"grad_norm": 0.12353639897995017,
"learning_rate": 1.632321624812971e-05,
"loss": 0.3961,
"step": 2296
},
{
"epoch": 1.135459152144358,
"grad_norm": 0.10530762722812056,
"learning_rate": 1.6320199146829323e-05,
"loss": 0.4121,
"step": 2297
},
{
"epoch": 1.1359535286120381,
"grad_norm": 0.12089653790804981,
"learning_rate": 1.631718108723504e-05,
"loss": 0.4249,
"step": 2298
},
{
"epoch": 1.1364479050797183,
"grad_norm": 0.11478672823263426,
"learning_rate": 1.631416206980446e-05,
"loss": 0.3897,
"step": 2299
},
{
"epoch": 1.1369422815473984,
"grad_norm": 0.117795838758018,
"learning_rate": 1.631114209499535e-05,
"loss": 0.4168,
"step": 2300
},
{
"epoch": 1.1374366580150785,
"grad_norm": 0.12204050581446967,
"learning_rate": 1.6308121163265602e-05,
"loss": 0.4634,
"step": 2301
},
{
"epoch": 1.1379310344827587,
"grad_norm": 0.48205876267370673,
"learning_rate": 1.630509927507327e-05,
"loss": 0.4195,
"step": 2302
},
{
"epoch": 1.1384254109504388,
"grad_norm": 0.11545119852586282,
"learning_rate": 1.6302076430876545e-05,
"loss": 0.3964,
"step": 2303
},
{
"epoch": 1.138919787418119,
"grad_norm": 0.1066095518129816,
"learning_rate": 1.6299052631133753e-05,
"loss": 0.4129,
"step": 2304
},
{
"epoch": 1.139414163885799,
"grad_norm": 0.11256200650435136,
"learning_rate": 1.629602787630338e-05,
"loss": 0.413,
"step": 2305
},
{
"epoch": 1.1399085403534792,
"grad_norm": 0.18776077602249183,
"learning_rate": 1.629300216684405e-05,
"loss": 0.3867,
"step": 2306
},
{
"epoch": 1.1404029168211594,
"grad_norm": 0.10850077258553067,
"learning_rate": 1.628997550321454e-05,
"loss": 0.3872,
"step": 2307
},
{
"epoch": 1.1408972932888395,
"grad_norm": 0.11617259915112593,
"learning_rate": 1.6286947885873755e-05,
"loss": 0.4006,
"step": 2308
},
{
"epoch": 1.1413916697565196,
"grad_norm": 0.11856520771104712,
"learning_rate": 1.628391931528076e-05,
"loss": 0.4069,
"step": 2309
},
{
"epoch": 1.1418860462241998,
"grad_norm": 0.11516490487111733,
"learning_rate": 1.628088979189476e-05,
"loss": 0.4138,
"step": 2310
},
{
"epoch": 1.14238042269188,
"grad_norm": 0.1078623645891197,
"learning_rate": 1.6277859316175102e-05,
"loss": 0.4148,
"step": 2311
},
{
"epoch": 1.14287479915956,
"grad_norm": 0.12061060565774402,
"learning_rate": 1.6274827888581275e-05,
"loss": 0.4022,
"step": 2312
},
{
"epoch": 1.1433691756272402,
"grad_norm": 0.11949557038381152,
"learning_rate": 1.6271795509572922e-05,
"loss": 0.3913,
"step": 2313
},
{
"epoch": 1.1438635520949203,
"grad_norm": 0.1197113244331187,
"learning_rate": 1.6268762179609825e-05,
"loss": 0.4155,
"step": 2314
},
{
"epoch": 1.1443579285626004,
"grad_norm": 0.11626180647468998,
"learning_rate": 1.62657278991519e-05,
"loss": 0.4212,
"step": 2315
},
{
"epoch": 1.1448523050302806,
"grad_norm": 0.12689048829049446,
"learning_rate": 1.626269266865923e-05,
"loss": 0.434,
"step": 2316
},
{
"epoch": 1.1453466814979607,
"grad_norm": 0.25718317099360655,
"learning_rate": 1.625965648859202e-05,
"loss": 0.4347,
"step": 2317
},
{
"epoch": 1.1458410579656408,
"grad_norm": 0.12858198850422464,
"learning_rate": 1.6256619359410626e-05,
"loss": 0.4081,
"step": 2318
},
{
"epoch": 1.146335434433321,
"grad_norm": 0.11494714068093313,
"learning_rate": 1.625358128157556e-05,
"loss": 0.4042,
"step": 2319
},
{
"epoch": 1.1468298109010011,
"grad_norm": 0.12158184026675435,
"learning_rate": 1.6250542255547456e-05,
"loss": 0.4072,
"step": 2320
},
{
"epoch": 1.1473241873686812,
"grad_norm": 0.11488333599137801,
"learning_rate": 1.6247502281787115e-05,
"loss": 0.4023,
"step": 2321
},
{
"epoch": 1.1478185638363614,
"grad_norm": 0.10721061708564188,
"learning_rate": 1.624446136075546e-05,
"loss": 0.3869,
"step": 2322
},
{
"epoch": 1.1483129403040415,
"grad_norm": 0.12126066076313753,
"learning_rate": 1.6241419492913567e-05,
"loss": 0.4046,
"step": 2323
},
{
"epoch": 1.1488073167717217,
"grad_norm": 0.10874584770207024,
"learning_rate": 1.6238376678722664e-05,
"loss": 0.3989,
"step": 2324
},
{
"epoch": 1.1493016932394018,
"grad_norm": 0.11136669660508912,
"learning_rate": 1.6235332918644112e-05,
"loss": 0.4285,
"step": 2325
},
{
"epoch": 1.149796069707082,
"grad_norm": 0.13383995832537987,
"learning_rate": 1.6232288213139416e-05,
"loss": 0.391,
"step": 2326
},
{
"epoch": 1.150290446174762,
"grad_norm": 0.1168739142490018,
"learning_rate": 1.6229242562670226e-05,
"loss": 0.4146,
"step": 2327
},
{
"epoch": 1.1507848226424422,
"grad_norm": 0.11914967853570385,
"learning_rate": 1.622619596769834e-05,
"loss": 0.407,
"step": 2328
},
{
"epoch": 1.1512791991101223,
"grad_norm": 0.11532728239071716,
"learning_rate": 1.622314842868569e-05,
"loss": 0.4112,
"step": 2329
},
{
"epoch": 1.1517735755778025,
"grad_norm": 0.12343479681398462,
"learning_rate": 1.622009994609436e-05,
"loss": 0.3914,
"step": 2330
},
{
"epoch": 1.1522679520454826,
"grad_norm": 0.12724545972810938,
"learning_rate": 1.621705052038657e-05,
"loss": 0.4605,
"step": 2331
},
{
"epoch": 1.1527623285131627,
"grad_norm": 0.11725050222496113,
"learning_rate": 1.621400015202469e-05,
"loss": 0.3954,
"step": 2332
},
{
"epoch": 1.1532567049808429,
"grad_norm": 0.13568185858417825,
"learning_rate": 1.6210948841471226e-05,
"loss": 0.4155,
"step": 2333
},
{
"epoch": 1.153751081448523,
"grad_norm": 0.12541120621558846,
"learning_rate": 1.620789658918883e-05,
"loss": 0.381,
"step": 2334
},
{
"epoch": 1.1542454579162031,
"grad_norm": 0.1195277450525778,
"learning_rate": 1.6204843395640296e-05,
"loss": 0.4234,
"step": 2335
},
{
"epoch": 1.1547398343838833,
"grad_norm": 0.10817648185745089,
"learning_rate": 1.6201789261288564e-05,
"loss": 0.4032,
"step": 2336
},
{
"epoch": 1.1552342108515634,
"grad_norm": 0.11754399779466865,
"learning_rate": 1.619873418659671e-05,
"loss": 0.4093,
"step": 2337
},
{
"epoch": 1.1557285873192435,
"grad_norm": 0.11880387074699265,
"learning_rate": 1.6195678172027965e-05,
"loss": 0.4161,
"step": 2338
},
{
"epoch": 1.1562229637869237,
"grad_norm": 0.11345184784878355,
"learning_rate": 1.6192621218045687e-05,
"loss": 0.4327,
"step": 2339
},
{
"epoch": 1.1567173402546038,
"grad_norm": 0.14291825583599588,
"learning_rate": 1.618956332511338e-05,
"loss": 0.3863,
"step": 2340
},
{
"epoch": 1.157211716722284,
"grad_norm": 0.11512176239034502,
"learning_rate": 1.6186504493694704e-05,
"loss": 0.4104,
"step": 2341
},
{
"epoch": 1.157706093189964,
"grad_norm": 0.11016210494066732,
"learning_rate": 1.6183444724253443e-05,
"loss": 0.4171,
"step": 2342
},
{
"epoch": 1.1582004696576442,
"grad_norm": 0.1132200207977791,
"learning_rate": 1.6180384017253537e-05,
"loss": 0.4074,
"step": 2343
},
{
"epoch": 1.1586948461253244,
"grad_norm": 0.12492248503286019,
"learning_rate": 1.6177322373159062e-05,
"loss": 0.4456,
"step": 2344
},
{
"epoch": 1.1591892225930045,
"grad_norm": 0.13300960986464425,
"learning_rate": 1.6174259792434233e-05,
"loss": 0.4046,
"step": 2345
},
{
"epoch": 1.1596835990606846,
"grad_norm": 0.11097612322027857,
"learning_rate": 1.6171196275543414e-05,
"loss": 0.4013,
"step": 2346
},
{
"epoch": 1.1601779755283648,
"grad_norm": 0.11477567304748879,
"learning_rate": 1.6168131822951106e-05,
"loss": 0.3836,
"step": 2347
},
{
"epoch": 1.160672351996045,
"grad_norm": 0.11138455211705751,
"learning_rate": 1.6165066435121956e-05,
"loss": 0.3844,
"step": 2348
},
{
"epoch": 1.161166728463725,
"grad_norm": 0.1641284925171861,
"learning_rate": 1.6162000112520747e-05,
"loss": 0.431,
"step": 2349
},
{
"epoch": 1.1616611049314052,
"grad_norm": 0.11018557538123738,
"learning_rate": 1.6158932855612408e-05,
"loss": 0.4009,
"step": 2350
},
{
"epoch": 1.1621554813990853,
"grad_norm": 0.11568982554601197,
"learning_rate": 1.6155864664862012e-05,
"loss": 0.3946,
"step": 2351
},
{
"epoch": 1.1626498578667654,
"grad_norm": 0.1141596763568074,
"learning_rate": 1.6152795540734766e-05,
"loss": 0.4249,
"step": 2352
},
{
"epoch": 1.1631442343344456,
"grad_norm": 0.1140635607492471,
"learning_rate": 1.6149725483696027e-05,
"loss": 0.4183,
"step": 2353
},
{
"epoch": 1.163638610802126,
"grad_norm": 0.12371071827692312,
"learning_rate": 1.6146654494211283e-05,
"loss": 0.436,
"step": 2354
},
{
"epoch": 1.1641329872698059,
"grad_norm": 0.1194637289589214,
"learning_rate": 1.614358257274618e-05,
"loss": 0.4093,
"step": 2355
},
{
"epoch": 1.1646273637374862,
"grad_norm": 0.11300081221853289,
"learning_rate": 1.6140509719766484e-05,
"loss": 0.4189,
"step": 2356
},
{
"epoch": 1.1651217402051661,
"grad_norm": 0.117964597358986,
"learning_rate": 1.613743593573812e-05,
"loss": 0.403,
"step": 2357
},
{
"epoch": 1.1656161166728465,
"grad_norm": 0.11320148073068903,
"learning_rate": 1.613436122112715e-05,
"loss": 0.4287,
"step": 2358
},
{
"epoch": 1.1661104931405264,
"grad_norm": 0.11689293712536364,
"learning_rate": 1.6131285576399763e-05,
"loss": 0.4214,
"step": 2359
},
{
"epoch": 1.1666048696082068,
"grad_norm": 0.1134429708870943,
"learning_rate": 1.612820900202231e-05,
"loss": 0.3828,
"step": 2360
},
{
"epoch": 1.1670992460758867,
"grad_norm": 0.11164265948626025,
"learning_rate": 1.6125131498461272e-05,
"loss": 0.3762,
"step": 2361
},
{
"epoch": 1.167593622543567,
"grad_norm": 0.11313292186892417,
"learning_rate": 1.612205306618327e-05,
"loss": 0.4048,
"step": 2362
},
{
"epoch": 1.1680879990112472,
"grad_norm": 0.5326320827372908,
"learning_rate": 1.6118973705655073e-05,
"loss": 0.4015,
"step": 2363
},
{
"epoch": 1.1685823754789273,
"grad_norm": 0.1177829968645412,
"learning_rate": 1.611589341734358e-05,
"loss": 0.3921,
"step": 2364
},
{
"epoch": 1.1690767519466074,
"grad_norm": 0.10980506426070208,
"learning_rate": 1.611281220171584e-05,
"loss": 0.3992,
"step": 2365
},
{
"epoch": 1.1695711284142876,
"grad_norm": 0.11284742116287053,
"learning_rate": 1.610973005923904e-05,
"loss": 0.4035,
"step": 2366
},
{
"epoch": 1.1700655048819677,
"grad_norm": 0.11274093675363138,
"learning_rate": 1.6106646990380505e-05,
"loss": 0.3727,
"step": 2367
},
{
"epoch": 1.1705598813496478,
"grad_norm": 0.10997410436862104,
"learning_rate": 1.6103562995607705e-05,
"loss": 0.4154,
"step": 2368
},
{
"epoch": 1.171054257817328,
"grad_norm": 0.12332889914909327,
"learning_rate": 1.6100478075388242e-05,
"loss": 0.4164,
"step": 2369
},
{
"epoch": 1.171548634285008,
"grad_norm": 0.13018669788445691,
"learning_rate": 1.6097392230189868e-05,
"loss": 0.4763,
"step": 2370
},
{
"epoch": 1.1720430107526882,
"grad_norm": 1.1534184319761278,
"learning_rate": 1.609430546048047e-05,
"loss": 0.393,
"step": 2371
},
{
"epoch": 1.1725373872203684,
"grad_norm": 0.13296542396416813,
"learning_rate": 1.6091217766728077e-05,
"loss": 0.4068,
"step": 2372
},
{
"epoch": 1.1730317636880485,
"grad_norm": 0.13179863975760822,
"learning_rate": 1.608812914940086e-05,
"loss": 0.4048,
"step": 2373
},
{
"epoch": 1.1735261401557286,
"grad_norm": 0.13144684490060082,
"learning_rate": 1.6085039608967123e-05,
"loss": 0.4155,
"step": 2374
},
{
"epoch": 1.1740205166234088,
"grad_norm": 0.14548568598134917,
"learning_rate": 1.608194914589532e-05,
"loss": 0.4069,
"step": 2375
},
{
"epoch": 1.174514893091089,
"grad_norm": 0.12763057510939477,
"learning_rate": 1.6078857760654034e-05,
"loss": 0.4274,
"step": 2376
},
{
"epoch": 1.175009269558769,
"grad_norm": 0.13842504691094293,
"learning_rate": 1.6075765453711992e-05,
"loss": 0.4239,
"step": 2377
},
{
"epoch": 1.1755036460264492,
"grad_norm": 0.1566712031468782,
"learning_rate": 1.6072672225538066e-05,
"loss": 0.3896,
"step": 2378
},
{
"epoch": 1.1759980224941293,
"grad_norm": 0.13239359543168575,
"learning_rate": 1.6069578076601265e-05,
"loss": 0.4331,
"step": 2379
},
{
"epoch": 1.1764923989618095,
"grad_norm": 0.1517090901083012,
"learning_rate": 1.606648300737073e-05,
"loss": 0.388,
"step": 2380
},
{
"epoch": 1.1769867754294896,
"grad_norm": 0.1174855756384191,
"learning_rate": 1.6063387018315756e-05,
"loss": 0.3891,
"step": 2381
},
{
"epoch": 1.1774811518971697,
"grad_norm": 0.13861135771299168,
"learning_rate": 1.6060290109905766e-05,
"loss": 0.3926,
"step": 2382
},
{
"epoch": 1.1779755283648499,
"grad_norm": 0.18479923620863378,
"learning_rate": 1.605719228261032e-05,
"loss": 0.4147,
"step": 2383
},
{
"epoch": 1.17846990483253,
"grad_norm": 0.12394870560825393,
"learning_rate": 1.6054093536899132e-05,
"loss": 0.4213,
"step": 2384
},
{
"epoch": 1.1789642813002101,
"grad_norm": 0.1490244140821534,
"learning_rate": 1.605099387324204e-05,
"loss": 0.4277,
"step": 2385
},
{
"epoch": 1.1794586577678903,
"grad_norm": 0.35487792562325593,
"learning_rate": 1.6047893292109026e-05,
"loss": 0.4044,
"step": 2386
},
{
"epoch": 1.1799530342355704,
"grad_norm": 0.12726317099736625,
"learning_rate": 1.6044791793970217e-05,
"loss": 0.404,
"step": 2387
},
{
"epoch": 1.1804474107032505,
"grad_norm": 0.12780383796816566,
"learning_rate": 1.604168937929588e-05,
"loss": 0.4012,
"step": 2388
},
{
"epoch": 1.1809417871709307,
"grad_norm": 0.12897223446100492,
"learning_rate": 1.6038586048556402e-05,
"loss": 0.3951,
"step": 2389
},
{
"epoch": 1.1814361636386108,
"grad_norm": 0.12851745096971784,
"learning_rate": 1.6035481802222333e-05,
"loss": 0.448,
"step": 2390
},
{
"epoch": 1.181930540106291,
"grad_norm": 0.1383833646901533,
"learning_rate": 1.6032376640764345e-05,
"loss": 0.393,
"step": 2391
},
{
"epoch": 1.182424916573971,
"grad_norm": 0.11358269147431659,
"learning_rate": 1.6029270564653258e-05,
"loss": 0.4475,
"step": 2392
},
{
"epoch": 1.1829192930416512,
"grad_norm": 0.12626515704732655,
"learning_rate": 1.602616357436003e-05,
"loss": 0.399,
"step": 2393
},
{
"epoch": 1.1834136695093314,
"grad_norm": 0.12140790225015116,
"learning_rate": 1.6023055670355748e-05,
"loss": 0.3955,
"step": 2394
},
{
"epoch": 1.1839080459770115,
"grad_norm": 0.1253247789977264,
"learning_rate": 1.6019946853111654e-05,
"loss": 0.4036,
"step": 2395
},
{
"epoch": 1.1844024224446916,
"grad_norm": 0.118484625705584,
"learning_rate": 1.6016837123099112e-05,
"loss": 0.4091,
"step": 2396
},
{
"epoch": 1.1848967989123718,
"grad_norm": 0.11725314568361932,
"learning_rate": 1.601372648078963e-05,
"loss": 0.4159,
"step": 2397
},
{
"epoch": 1.185391175380052,
"grad_norm": 0.1209343543878476,
"learning_rate": 1.6010614926654868e-05,
"loss": 0.3942,
"step": 2398
},
{
"epoch": 1.185885551847732,
"grad_norm": 0.19465694723521135,
"learning_rate": 1.60075024611666e-05,
"loss": 0.4425,
"step": 2399
},
{
"epoch": 1.1863799283154122,
"grad_norm": 0.11409524906556277,
"learning_rate": 1.600438908479676e-05,
"loss": 0.4573,
"step": 2400
},
{
"epoch": 1.1868743047830923,
"grad_norm": 0.13002786686901446,
"learning_rate": 1.6001274798017405e-05,
"loss": 0.4289,
"step": 2401
},
{
"epoch": 1.1873686812507724,
"grad_norm": 0.11089078659010689,
"learning_rate": 1.5998159601300734e-05,
"loss": 0.4045,
"step": 2402
},
{
"epoch": 1.1878630577184526,
"grad_norm": 0.13028211967213385,
"learning_rate": 1.599504349511909e-05,
"loss": 0.4287,
"step": 2403
},
{
"epoch": 1.1883574341861327,
"grad_norm": 0.14377794308086542,
"learning_rate": 1.5991926479944944e-05,
"loss": 0.3696,
"step": 2404
},
{
"epoch": 1.1888518106538128,
"grad_norm": 0.12452765458699373,
"learning_rate": 1.5988808556250918e-05,
"loss": 0.4263,
"step": 2405
},
{
"epoch": 1.189346187121493,
"grad_norm": 0.14278984299427658,
"learning_rate": 1.5985689724509755e-05,
"loss": 0.3974,
"step": 2406
},
{
"epoch": 1.1898405635891731,
"grad_norm": 0.13004747588463836,
"learning_rate": 1.5982569985194355e-05,
"loss": 0.3802,
"step": 2407
},
{
"epoch": 1.1903349400568533,
"grad_norm": 0.11245728085617632,
"learning_rate": 1.5979449338777738e-05,
"loss": 0.402,
"step": 2408
},
{
"epoch": 1.1908293165245334,
"grad_norm": 0.12466685172495812,
"learning_rate": 1.5976327785733073e-05,
"loss": 0.4205,
"step": 2409
},
{
"epoch": 1.1913236929922135,
"grad_norm": 0.11299655908535834,
"learning_rate": 1.597320532653366e-05,
"loss": 0.3895,
"step": 2410
},
{
"epoch": 1.1918180694598937,
"grad_norm": 0.12356475417498551,
"learning_rate": 1.5970081961652937e-05,
"loss": 0.422,
"step": 2411
},
{
"epoch": 1.1923124459275738,
"grad_norm": 0.15345445787230874,
"learning_rate": 1.5966957691564485e-05,
"loss": 0.3958,
"step": 2412
},
{
"epoch": 1.192806822395254,
"grad_norm": 0.12264323329006703,
"learning_rate": 1.5963832516742016e-05,
"loss": 0.4344,
"step": 2413
},
{
"epoch": 1.193301198862934,
"grad_norm": 0.12011445879274492,
"learning_rate": 1.596070643765938e-05,
"loss": 0.3977,
"step": 2414
},
{
"epoch": 1.1937955753306142,
"grad_norm": 0.1307346607566152,
"learning_rate": 1.5957579454790574e-05,
"loss": 0.3992,
"step": 2415
},
{
"epoch": 1.1942899517982943,
"grad_norm": 0.19261204848925667,
"learning_rate": 1.595445156860971e-05,
"loss": 0.3883,
"step": 2416
},
{
"epoch": 1.1947843282659745,
"grad_norm": 0.13131855919915528,
"learning_rate": 1.595132277959106e-05,
"loss": 0.4224,
"step": 2417
},
{
"epoch": 1.1952787047336546,
"grad_norm": 0.11349846774280349,
"learning_rate": 1.5948193088209024e-05,
"loss": 0.4045,
"step": 2418
},
{
"epoch": 1.1957730812013347,
"grad_norm": 0.14934384238684825,
"learning_rate": 1.5945062494938136e-05,
"loss": 0.4005,
"step": 2419
},
{
"epoch": 1.1962674576690149,
"grad_norm": 0.12443658466699761,
"learning_rate": 1.594193100025307e-05,
"loss": 0.3744,
"step": 2420
},
{
"epoch": 1.196761834136695,
"grad_norm": 0.12241150787805768,
"learning_rate": 1.593879860462863e-05,
"loss": 0.3999,
"step": 2421
},
{
"epoch": 1.1972562106043751,
"grad_norm": 0.10898861062708047,
"learning_rate": 1.593566530853977e-05,
"loss": 0.3921,
"step": 2422
},
{
"epoch": 1.1977505870720553,
"grad_norm": 0.1123895682434288,
"learning_rate": 1.593253111246157e-05,
"loss": 0.411,
"step": 2423
},
{
"epoch": 1.1982449635397354,
"grad_norm": 0.12123837078367888,
"learning_rate": 1.5929396016869247e-05,
"loss": 0.3793,
"step": 2424
},
{
"epoch": 1.1987393400074156,
"grad_norm": 0.10976953141475855,
"learning_rate": 1.5926260022238163e-05,
"loss": 0.4366,
"step": 2425
},
{
"epoch": 1.1992337164750957,
"grad_norm": 0.1251042104604036,
"learning_rate": 1.5923123129043806e-05,
"loss": 0.4199,
"step": 2426
},
{
"epoch": 1.199728092942776,
"grad_norm": 0.1145449106909501,
"learning_rate": 1.59199853377618e-05,
"loss": 0.4129,
"step": 2427
},
{
"epoch": 1.200222469410456,
"grad_norm": 0.49915375799961476,
"learning_rate": 1.5916846648867918e-05,
"loss": 0.3925,
"step": 2428
},
{
"epoch": 1.2007168458781363,
"grad_norm": 0.1093761178960894,
"learning_rate": 1.5913707062838053e-05,
"loss": 0.4096,
"step": 2429
},
{
"epoch": 1.2012112223458162,
"grad_norm": 0.12987489377876535,
"learning_rate": 1.5910566580148248e-05,
"loss": 0.411,
"step": 2430
},
{
"epoch": 1.2017055988134966,
"grad_norm": 0.12140186455374934,
"learning_rate": 1.590742520127467e-05,
"loss": 0.4057,
"step": 2431
},
{
"epoch": 1.2021999752811765,
"grad_norm": 0.11407054103397875,
"learning_rate": 1.590428292669363e-05,
"loss": 0.4212,
"step": 2432
},
{
"epoch": 1.2026943517488569,
"grad_norm": 0.1255879898474546,
"learning_rate": 1.590113975688158e-05,
"loss": 0.3829,
"step": 2433
},
{
"epoch": 1.2031887282165368,
"grad_norm": 0.11721444597454439,
"learning_rate": 1.5897995692315084e-05,
"loss": 0.4083,
"step": 2434
},
{
"epoch": 1.2036831046842171,
"grad_norm": 0.1146330083438773,
"learning_rate": 1.589485073347087e-05,
"loss": 0.4024,
"step": 2435
},
{
"epoch": 1.2041774811518973,
"grad_norm": 0.12348416703601107,
"learning_rate": 1.5891704880825784e-05,
"loss": 0.4024,
"step": 2436
},
{
"epoch": 1.2046718576195774,
"grad_norm": 0.1180151282167768,
"learning_rate": 1.5888558134856814e-05,
"loss": 0.4279,
"step": 2437
},
{
"epoch": 1.2051662340872575,
"grad_norm": 1.002118894926619,
"learning_rate": 1.5885410496041084e-05,
"loss": 0.4258,
"step": 2438
},
{
"epoch": 1.2056606105549377,
"grad_norm": 0.12142624940796676,
"learning_rate": 1.588226196485585e-05,
"loss": 0.4257,
"step": 2439
},
{
"epoch": 1.2061549870226178,
"grad_norm": 0.12353764849355418,
"learning_rate": 1.58791125417785e-05,
"loss": 0.4131,
"step": 2440
},
{
"epoch": 1.206649363490298,
"grad_norm": 0.12083974176325653,
"learning_rate": 1.587596222728657e-05,
"loss": 0.3953,
"step": 2441
},
{
"epoch": 1.207143739957978,
"grad_norm": 0.12511769164897962,
"learning_rate": 1.5872811021857724e-05,
"loss": 0.4278,
"step": 2442
},
{
"epoch": 1.2076381164256582,
"grad_norm": 0.13856138772590557,
"learning_rate": 1.586965892596975e-05,
"loss": 0.4078,
"step": 2443
},
{
"epoch": 1.2081324928933384,
"grad_norm": 0.12917277071492464,
"learning_rate": 1.5866505940100592e-05,
"loss": 0.4072,
"step": 2444
},
{
"epoch": 1.2086268693610185,
"grad_norm": 0.1308824344765974,
"learning_rate": 1.5863352064728313e-05,
"loss": 0.4054,
"step": 2445
},
{
"epoch": 1.2091212458286986,
"grad_norm": 0.12388878361223318,
"learning_rate": 1.5860197300331116e-05,
"loss": 0.392,
"step": 2446
},
{
"epoch": 1.2096156222963788,
"grad_norm": 0.13042287332021857,
"learning_rate": 1.5857041647387346e-05,
"loss": 0.401,
"step": 2447
},
{
"epoch": 1.210109998764059,
"grad_norm": 0.12137519772436851,
"learning_rate": 1.5853885106375466e-05,
"loss": 0.4558,
"step": 2448
},
{
"epoch": 1.210604375231739,
"grad_norm": 0.17783344420665584,
"learning_rate": 1.5850727677774088e-05,
"loss": 0.4014,
"step": 2449
},
{
"epoch": 1.2110987516994192,
"grad_norm": 0.12726682732546593,
"learning_rate": 1.5847569362061956e-05,
"loss": 0.4013,
"step": 2450
},
{
"epoch": 1.2115931281670993,
"grad_norm": 0.1161783036771493,
"learning_rate": 1.5844410159717943e-05,
"loss": 0.4343,
"step": 2451
},
{
"epoch": 1.2120875046347794,
"grad_norm": 0.13049418722752276,
"learning_rate": 1.5841250071221058e-05,
"loss": 0.4194,
"step": 2452
},
{
"epoch": 1.2125818811024596,
"grad_norm": 0.1109912265346569,
"learning_rate": 1.5838089097050453e-05,
"loss": 0.3862,
"step": 2453
},
{
"epoch": 1.2130762575701397,
"grad_norm": 0.12275887714678232,
"learning_rate": 1.58349272376854e-05,
"loss": 0.4135,
"step": 2454
},
{
"epoch": 1.2135706340378198,
"grad_norm": 0.11697877575090633,
"learning_rate": 1.583176449360532e-05,
"loss": 0.431,
"step": 2455
},
{
"epoch": 1.2140650105055,
"grad_norm": 0.12408120384811146,
"learning_rate": 1.582860086528976e-05,
"loss": 0.3856,
"step": 2456
},
{
"epoch": 1.21455938697318,
"grad_norm": 0.10834485178332241,
"learning_rate": 1.582543635321839e-05,
"loss": 0.4029,
"step": 2457
},
{
"epoch": 1.2150537634408602,
"grad_norm": 0.12107925860938756,
"learning_rate": 1.5822270957871048e-05,
"loss": 0.4092,
"step": 2458
},
{
"epoch": 1.2155481399085404,
"grad_norm": 0.27778830337709526,
"learning_rate": 1.5819104679727664e-05,
"loss": 0.3915,
"step": 2459
},
{
"epoch": 1.2160425163762205,
"grad_norm": 0.11512338279453174,
"learning_rate": 1.581593751926833e-05,
"loss": 0.4339,
"step": 2460
},
{
"epoch": 1.2165368928439007,
"grad_norm": 0.11797001444794626,
"learning_rate": 1.5812769476973266e-05,
"loss": 0.4088,
"step": 2461
},
{
"epoch": 1.2170312693115808,
"grad_norm": 0.11431723917570344,
"learning_rate": 1.5809600553322814e-05,
"loss": 0.44,
"step": 2462
},
{
"epoch": 1.217525645779261,
"grad_norm": 0.12758322045959347,
"learning_rate": 1.580643074879747e-05,
"loss": 0.4111,
"step": 2463
},
{
"epoch": 1.218020022246941,
"grad_norm": 0.1259788264360895,
"learning_rate": 1.5803260063877847e-05,
"loss": 0.4226,
"step": 2464
},
{
"epoch": 1.2185143987146212,
"grad_norm": 0.1175054097916265,
"learning_rate": 1.5800088499044696e-05,
"loss": 0.4318,
"step": 2465
},
{
"epoch": 1.2190087751823013,
"grad_norm": 0.1250209393906572,
"learning_rate": 1.5796916054778903e-05,
"loss": 0.4288,
"step": 2466
},
{
"epoch": 1.2195031516499815,
"grad_norm": 0.11822648619788391,
"learning_rate": 1.579374273156149e-05,
"loss": 0.4052,
"step": 2467
},
{
"epoch": 1.2199975281176616,
"grad_norm": 0.11806193521865306,
"learning_rate": 1.5790568529873603e-05,
"loss": 0.4131,
"step": 2468
},
{
"epoch": 1.2204919045853417,
"grad_norm": 0.11644022423811679,
"learning_rate": 1.5787393450196532e-05,
"loss": 0.444,
"step": 2469
},
{
"epoch": 1.2209862810530219,
"grad_norm": 0.1150919978193474,
"learning_rate": 1.5784217493011695e-05,
"loss": 0.4072,
"step": 2470
},
{
"epoch": 1.221480657520702,
"grad_norm": 0.10974645716205783,
"learning_rate": 1.578104065880064e-05,
"loss": 0.4119,
"step": 2471
},
{
"epoch": 1.2219750339883821,
"grad_norm": 0.11728379271267919,
"learning_rate": 1.5777862948045055e-05,
"loss": 0.4203,
"step": 2472
},
{
"epoch": 1.2224694104560623,
"grad_norm": 0.11319976183150726,
"learning_rate": 1.5774684361226754e-05,
"loss": 0.3811,
"step": 2473
},
{
"epoch": 1.2229637869237424,
"grad_norm": 0.11838803728112211,
"learning_rate": 1.577150489882769e-05,
"loss": 0.4016,
"step": 2474
},
{
"epoch": 1.2234581633914225,
"grad_norm": 0.11182194096995368,
"learning_rate": 1.5768324561329946e-05,
"loss": 0.409,
"step": 2475
},
{
"epoch": 1.2239525398591027,
"grad_norm": 0.11137694355004606,
"learning_rate": 1.5765143349215736e-05,
"loss": 0.395,
"step": 2476
},
{
"epoch": 1.2244469163267828,
"grad_norm": 0.113777174832844,
"learning_rate": 1.5761961262967405e-05,
"loss": 0.4116,
"step": 2477
},
{
"epoch": 1.224941292794463,
"grad_norm": 0.11587710531680866,
"learning_rate": 1.5758778303067442e-05,
"loss": 0.3781,
"step": 2478
},
{
"epoch": 1.225435669262143,
"grad_norm": 0.11052433643804668,
"learning_rate": 1.575559446999845e-05,
"loss": 0.4146,
"step": 2479
},
{
"epoch": 1.2259300457298232,
"grad_norm": 0.11511517872971888,
"learning_rate": 1.5752409764243184e-05,
"loss": 0.4095,
"step": 2480
},
{
"epoch": 1.2264244221975034,
"grad_norm": 0.11097546194710346,
"learning_rate": 1.5749224186284514e-05,
"loss": 0.4026,
"step": 2481
},
{
"epoch": 1.2269187986651835,
"grad_norm": 0.14047136198477914,
"learning_rate": 1.5746037736605454e-05,
"loss": 0.4191,
"step": 2482
},
{
"epoch": 1.2274131751328636,
"grad_norm": 0.1167888436958766,
"learning_rate": 1.574285041568915e-05,
"loss": 0.3832,
"step": 2483
},
{
"epoch": 1.2279075516005438,
"grad_norm": 0.1122626290733257,
"learning_rate": 1.5739662224018863e-05,
"loss": 0.4011,
"step": 2484
},
{
"epoch": 1.228401928068224,
"grad_norm": 0.11897954638355161,
"learning_rate": 1.5736473162078017e-05,
"loss": 0.3978,
"step": 2485
},
{
"epoch": 1.228896304535904,
"grad_norm": 0.11731907018259351,
"learning_rate": 1.573328323035014e-05,
"loss": 0.4424,
"step": 2486
},
{
"epoch": 1.2293906810035842,
"grad_norm": 0.11226788112986799,
"learning_rate": 1.57300924293189e-05,
"loss": 0.401,
"step": 2487
},
{
"epoch": 1.2298850574712643,
"grad_norm": 0.11886931060679291,
"learning_rate": 1.5726900759468104e-05,
"loss": 0.398,
"step": 2488
},
{
"epoch": 1.2303794339389444,
"grad_norm": 0.12043712497206317,
"learning_rate": 1.5723708221281688e-05,
"loss": 0.394,
"step": 2489
},
{
"epoch": 1.2308738104066246,
"grad_norm": 0.11339293972964658,
"learning_rate": 1.5720514815243714e-05,
"loss": 0.3968,
"step": 2490
},
{
"epoch": 1.2313681868743047,
"grad_norm": 0.11145204598327617,
"learning_rate": 1.5717320541838378e-05,
"loss": 0.399,
"step": 2491
},
{
"epoch": 1.2318625633419849,
"grad_norm": 0.11321649832342265,
"learning_rate": 1.571412540155001e-05,
"loss": 0.4469,
"step": 2492
},
{
"epoch": 1.232356939809665,
"grad_norm": 0.11575350055093042,
"learning_rate": 1.571092939486307e-05,
"loss": 0.3725,
"step": 2493
},
{
"epoch": 1.2328513162773451,
"grad_norm": 0.10804976501592281,
"learning_rate": 1.5707732522262148e-05,
"loss": 0.4199,
"step": 2494
},
{
"epoch": 1.2333456927450253,
"grad_norm": 0.12379692485918581,
"learning_rate": 1.5704534784231964e-05,
"loss": 0.4092,
"step": 2495
},
{
"epoch": 1.2338400692127054,
"grad_norm": 0.11352572372046799,
"learning_rate": 1.570133618125738e-05,
"loss": 0.4063,
"step": 2496
},
{
"epoch": 1.2343344456803855,
"grad_norm": 0.11142562620688533,
"learning_rate": 1.569813671382338e-05,
"loss": 0.383,
"step": 2497
},
{
"epoch": 1.2348288221480657,
"grad_norm": 0.10714131110477156,
"learning_rate": 1.569493638241507e-05,
"loss": 0.3958,
"step": 2498
},
{
"epoch": 1.2353231986157458,
"grad_norm": 0.11265737996513761,
"learning_rate": 1.5691735187517706e-05,
"loss": 0.3938,
"step": 2499
},
{
"epoch": 1.235817575083426,
"grad_norm": 0.11442987839917063,
"learning_rate": 1.5688533129616665e-05,
"loss": 0.4192,
"step": 2500
},
{
"epoch": 1.236311951551106,
"grad_norm": 0.11837870883990922,
"learning_rate": 1.5685330209197452e-05,
"loss": 0.4056,
"step": 2501
},
{
"epoch": 1.2368063280187864,
"grad_norm": 0.11384333892533793,
"learning_rate": 1.5682126426745714e-05,
"loss": 0.4025,
"step": 2502
},
{
"epoch": 1.2373007044864663,
"grad_norm": 0.10932716650349913,
"learning_rate": 1.567892178274721e-05,
"loss": 0.4491,
"step": 2503
},
{
"epoch": 1.2377950809541467,
"grad_norm": 0.20926135287903208,
"learning_rate": 1.5675716277687853e-05,
"loss": 0.3804,
"step": 2504
},
{
"epoch": 1.2382894574218266,
"grad_norm": 0.11114049031904621,
"learning_rate": 1.5672509912053664e-05,
"loss": 0.3964,
"step": 2505
},
{
"epoch": 1.238783833889507,
"grad_norm": 0.10893236204205238,
"learning_rate": 1.5669302686330812e-05,
"loss": 0.4285,
"step": 2506
},
{
"epoch": 1.2392782103571869,
"grad_norm": 0.11634524934405427,
"learning_rate": 1.566609460100559e-05,
"loss": 0.4007,
"step": 2507
},
{
"epoch": 1.2397725868248672,
"grad_norm": 0.11269191196448657,
"learning_rate": 1.5662885656564414e-05,
"loss": 0.3955,
"step": 2508
},
{
"epoch": 1.2402669632925472,
"grad_norm": 0.11510280031657395,
"learning_rate": 1.5659675853493844e-05,
"loss": 0.4191,
"step": 2509
},
{
"epoch": 1.2407613397602275,
"grad_norm": 0.11826264871821762,
"learning_rate": 1.5656465192280558e-05,
"loss": 0.4215,
"step": 2510
},
{
"epoch": 1.2412557162279076,
"grad_norm": 0.11474495969196728,
"learning_rate": 1.5653253673411372e-05,
"loss": 0.4125,
"step": 2511
},
{
"epoch": 1.2417500926955878,
"grad_norm": 0.12925261239181618,
"learning_rate": 1.565004129737323e-05,
"loss": 0.3958,
"step": 2512
},
{
"epoch": 1.242244469163268,
"grad_norm": 0.10443906939961242,
"learning_rate": 1.5646828064653202e-05,
"loss": 0.4517,
"step": 2513
},
{
"epoch": 1.242738845630948,
"grad_norm": 0.12346020022296166,
"learning_rate": 1.5643613975738495e-05,
"loss": 0.4202,
"step": 2514
},
{
"epoch": 1.2432332220986282,
"grad_norm": 0.10898291025276118,
"learning_rate": 1.564039903111644e-05,
"loss": 0.437,
"step": 2515
},
{
"epoch": 1.2437275985663083,
"grad_norm": 0.12035528733237513,
"learning_rate": 1.56371832312745e-05,
"loss": 0.3921,
"step": 2516
},
{
"epoch": 1.2442219750339885,
"grad_norm": 0.11563881292406988,
"learning_rate": 1.5633966576700265e-05,
"loss": 0.4186,
"step": 2517
},
{
"epoch": 1.2447163515016686,
"grad_norm": 0.13898962738226453,
"learning_rate": 1.5630749067881464e-05,
"loss": 0.418,
"step": 2518
},
{
"epoch": 1.2452107279693487,
"grad_norm": 0.11112182186403073,
"learning_rate": 1.5627530705305946e-05,
"loss": 0.3998,
"step": 2519
},
{
"epoch": 1.2457051044370289,
"grad_norm": 0.12477152313126118,
"learning_rate": 1.5624311489461684e-05,
"loss": 0.4355,
"step": 2520
},
{
"epoch": 1.246199480904709,
"grad_norm": 0.11700605819964413,
"learning_rate": 1.56210914208368e-05,
"loss": 0.4187,
"step": 2521
},
{
"epoch": 1.2466938573723891,
"grad_norm": 0.117883826310101,
"learning_rate": 1.5617870499919526e-05,
"loss": 0.4066,
"step": 2522
},
{
"epoch": 1.2471882338400693,
"grad_norm": 0.10803039944392147,
"learning_rate": 1.5614648727198232e-05,
"loss": 0.4137,
"step": 2523
},
{
"epoch": 1.2476826103077494,
"grad_norm": 0.11722092339914135,
"learning_rate": 1.561142610316142e-05,
"loss": 0.4086,
"step": 2524
},
{
"epoch": 1.2481769867754295,
"grad_norm": 0.1147124908756234,
"learning_rate": 1.5608202628297713e-05,
"loss": 0.4215,
"step": 2525
},
{
"epoch": 1.2486713632431097,
"grad_norm": 0.13454968002217338,
"learning_rate": 1.5604978303095867e-05,
"loss": 0.4173,
"step": 2526
},
{
"epoch": 1.2491657397107898,
"grad_norm": 0.11730476780312003,
"learning_rate": 1.5601753128044773e-05,
"loss": 0.4284,
"step": 2527
},
{
"epoch": 1.24966011617847,
"grad_norm": 0.11114986038108016,
"learning_rate": 1.559852710363344e-05,
"loss": 0.3971,
"step": 2528
},
{
"epoch": 1.25015449264615,
"grad_norm": 0.12532930325770225,
"learning_rate": 1.559530023035101e-05,
"loss": 0.406,
"step": 2529
},
{
"epoch": 1.2506488691138302,
"grad_norm": 0.1168970117107786,
"learning_rate": 1.5592072508686754e-05,
"loss": 0.4182,
"step": 2530
},
{
"epoch": 1.2506488691138302,
"eval_loss": 0.5011072158813477,
"eval_runtime": 100.9812,
"eval_samples_per_second": 300.591,
"eval_steps_per_second": 37.581,
"step": 2530
},
{
"epoch": 1.2511432455815104,
"grad_norm": 0.12852211713251888,
"learning_rate": 1.5588843939130077e-05,
"loss": 0.3858,
"step": 2531
},
{
"epoch": 1.2516376220491905,
"grad_norm": 0.11004129599285092,
"learning_rate": 1.5585614522170506e-05,
"loss": 0.401,
"step": 2532
},
{
"epoch": 1.2521319985168706,
"grad_norm": 0.13369466033604424,
"learning_rate": 1.5582384258297694e-05,
"loss": 0.4206,
"step": 2533
},
{
"epoch": 1.2526263749845508,
"grad_norm": 0.11043531444312991,
"learning_rate": 1.557915314800143e-05,
"loss": 0.38,
"step": 2534
},
{
"epoch": 1.253120751452231,
"grad_norm": 0.12764457760193867,
"learning_rate": 1.557592119177163e-05,
"loss": 0.3904,
"step": 2535
},
{
"epoch": 1.253615127919911,
"grad_norm": 0.22240188971451902,
"learning_rate": 1.5572688390098328e-05,
"loss": 0.4286,
"step": 2536
},
{
"epoch": 1.2541095043875912,
"grad_norm": 0.1217526849358331,
"learning_rate": 1.5569454743471702e-05,
"loss": 0.3926,
"step": 2537
},
{
"epoch": 1.2546038808552713,
"grad_norm": 0.11192582790135551,
"learning_rate": 1.556622025238205e-05,
"loss": 0.4174,
"step": 2538
},
{
"epoch": 1.2550982573229514,
"grad_norm": 0.11281588651883352,
"learning_rate": 1.5562984917319795e-05,
"loss": 0.4051,
"step": 2539
},
{
"epoch": 1.2555926337906316,
"grad_norm": 0.10804716415877169,
"learning_rate": 1.5559748738775493e-05,
"loss": 0.3895,
"step": 2540
},
{
"epoch": 1.2560870102583117,
"grad_norm": 0.11246715785902595,
"learning_rate": 1.5556511717239828e-05,
"loss": 0.4015,
"step": 2541
},
{
"epoch": 1.2565813867259918,
"grad_norm": 1.049730145411187,
"learning_rate": 1.5553273853203608e-05,
"loss": 0.3747,
"step": 2542
},
{
"epoch": 1.257075763193672,
"grad_norm": 0.11862159415390623,
"learning_rate": 1.555003514715777e-05,
"loss": 0.4145,
"step": 2543
},
{
"epoch": 1.2575701396613521,
"grad_norm": 0.11762364565473456,
"learning_rate": 1.554679559959338e-05,
"loss": 0.4179,
"step": 2544
},
{
"epoch": 1.2580645161290323,
"grad_norm": 0.12355271588138561,
"learning_rate": 1.5543555211001638e-05,
"loss": 0.3926,
"step": 2545
},
{
"epoch": 1.2585588925967124,
"grad_norm": 0.11846242130214704,
"learning_rate": 1.5540313981873853e-05,
"loss": 0.3799,
"step": 2546
},
{
"epoch": 1.2590532690643925,
"grad_norm": 0.12126025695065404,
"learning_rate": 1.5537071912701482e-05,
"loss": 0.412,
"step": 2547
},
{
"epoch": 1.2595476455320727,
"grad_norm": 0.120687383304594,
"learning_rate": 1.5533829003976098e-05,
"loss": 0.4118,
"step": 2548
},
{
"epoch": 1.2600420219997528,
"grad_norm": 0.13705071046327255,
"learning_rate": 1.55305852561894e-05,
"loss": 0.4352,
"step": 2549
},
{
"epoch": 1.260536398467433,
"grad_norm": 0.13020653058431864,
"learning_rate": 1.5527340669833227e-05,
"loss": 0.4069,
"step": 2550
},
{
"epoch": 1.261030774935113,
"grad_norm": 0.1346431875088589,
"learning_rate": 1.5524095245399525e-05,
"loss": 0.4296,
"step": 2551
},
{
"epoch": 1.2615251514027932,
"grad_norm": 0.12358482014799109,
"learning_rate": 1.5520848983380386e-05,
"loss": 0.3907,
"step": 2552
},
{
"epoch": 1.2620195278704733,
"grad_norm": 0.2577601114143098,
"learning_rate": 1.5517601884268022e-05,
"loss": 0.4228,
"step": 2553
},
{
"epoch": 1.2625139043381535,
"grad_norm": 0.10891931875986724,
"learning_rate": 1.5514353948554765e-05,
"loss": 0.3853,
"step": 2554
},
{
"epoch": 1.2630082808058336,
"grad_norm": 0.11598474640635033,
"learning_rate": 1.5511105176733084e-05,
"loss": 0.4348,
"step": 2555
},
{
"epoch": 1.2635026572735137,
"grad_norm": 0.11694246685215766,
"learning_rate": 1.550785556929557e-05,
"loss": 0.4208,
"step": 2556
},
{
"epoch": 1.2639970337411939,
"grad_norm": 0.13282385133882185,
"learning_rate": 1.550460512673494e-05,
"loss": 0.3997,
"step": 2557
},
{
"epoch": 1.264491410208874,
"grad_norm": 0.12018442053788646,
"learning_rate": 1.5501353849544046e-05,
"loss": 0.4031,
"step": 2558
},
{
"epoch": 1.2649857866765541,
"grad_norm": 0.1175444592947978,
"learning_rate": 1.5498101738215847e-05,
"loss": 0.4144,
"step": 2559
},
{
"epoch": 1.2654801631442343,
"grad_norm": 0.11147603680342978,
"learning_rate": 1.5494848793243456e-05,
"loss": 0.4256,
"step": 2560
},
{
"epoch": 1.2659745396119144,
"grad_norm": 0.1284486267159666,
"learning_rate": 1.5491595015120086e-05,
"loss": 0.4023,
"step": 2561
},
{
"epoch": 1.2664689160795946,
"grad_norm": 0.10750152762129385,
"learning_rate": 1.548834040433909e-05,
"loss": 0.3904,
"step": 2562
},
{
"epoch": 1.2669632925472747,
"grad_norm": 0.1301307366280665,
"learning_rate": 1.548508496139395e-05,
"loss": 0.4023,
"step": 2563
},
{
"epoch": 1.2674576690149548,
"grad_norm": 0.11121882293683066,
"learning_rate": 1.5481828686778266e-05,
"loss": 0.4152,
"step": 2564
},
{
"epoch": 1.267952045482635,
"grad_norm": 0.11793135430305372,
"learning_rate": 1.547857158098577e-05,
"loss": 0.4181,
"step": 2565
},
{
"epoch": 1.268446421950315,
"grad_norm": 0.11377594968022556,
"learning_rate": 1.547531364451031e-05,
"loss": 0.4057,
"step": 2566
},
{
"epoch": 1.2689407984179952,
"grad_norm": 0.11769380371884686,
"learning_rate": 1.5472054877845876e-05,
"loss": 0.4374,
"step": 2567
},
{
"epoch": 1.2694351748856754,
"grad_norm": 0.12313916000157646,
"learning_rate": 1.546879528148657e-05,
"loss": 0.4158,
"step": 2568
},
{
"epoch": 1.2699295513533555,
"grad_norm": 0.1393950129157171,
"learning_rate": 1.5465534855926626e-05,
"loss": 0.3903,
"step": 2569
},
{
"epoch": 1.2704239278210356,
"grad_norm": 0.12127099480378364,
"learning_rate": 1.5462273601660407e-05,
"loss": 0.4444,
"step": 2570
},
{
"epoch": 1.270918304288716,
"grad_norm": 0.11548963566239484,
"learning_rate": 1.5459011519182393e-05,
"loss": 0.3977,
"step": 2571
},
{
"epoch": 1.271412680756396,
"grad_norm": 0.12205631249612363,
"learning_rate": 1.5455748608987192e-05,
"loss": 0.3662,
"step": 2572
},
{
"epoch": 1.2719070572240763,
"grad_norm": 0.11292071101102473,
"learning_rate": 1.5452484871569545e-05,
"loss": 0.3914,
"step": 2573
},
{
"epoch": 1.2724014336917562,
"grad_norm": 0.11064152833620496,
"learning_rate": 1.5449220307424312e-05,
"loss": 0.3982,
"step": 2574
},
{
"epoch": 1.2728958101594365,
"grad_norm": 0.11044789186585659,
"learning_rate": 1.5445954917046477e-05,
"loss": 0.4128,
"step": 2575
},
{
"epoch": 1.2733901866271164,
"grad_norm": 0.109211302538864,
"learning_rate": 1.5442688700931152e-05,
"loss": 0.417,
"step": 2576
},
{
"epoch": 1.2738845630947968,
"grad_norm": 0.12091180578064518,
"learning_rate": 1.543942165957357e-05,
"loss": 0.3936,
"step": 2577
},
{
"epoch": 1.2743789395624767,
"grad_norm": 0.10581755908311492,
"learning_rate": 1.5436153793469102e-05,
"loss": 0.3719,
"step": 2578
},
{
"epoch": 1.274873316030157,
"grad_norm": 0.1120041704132693,
"learning_rate": 1.543288510311323e-05,
"loss": 0.4241,
"step": 2579
},
{
"epoch": 1.275367692497837,
"grad_norm": 0.11523518898940618,
"learning_rate": 1.542961558900156e-05,
"loss": 0.4112,
"step": 2580
},
{
"epoch": 1.2758620689655173,
"grad_norm": 0.12213363769757164,
"learning_rate": 1.542634525162984e-05,
"loss": 0.4185,
"step": 2581
},
{
"epoch": 1.2763564454331973,
"grad_norm": 0.20459989448984175,
"learning_rate": 1.5423074091493928e-05,
"loss": 0.3923,
"step": 2582
},
{
"epoch": 1.2768508219008776,
"grad_norm": 0.11572526579699191,
"learning_rate": 1.5419802109089803e-05,
"loss": 0.4117,
"step": 2583
},
{
"epoch": 1.2773451983685575,
"grad_norm": 0.11417957845334861,
"learning_rate": 1.541652930491359e-05,
"loss": 0.4064,
"step": 2584
},
{
"epoch": 1.277839574836238,
"grad_norm": 0.1269182211190018,
"learning_rate": 1.5413255679461506e-05,
"loss": 0.4077,
"step": 2585
},
{
"epoch": 1.2783339513039178,
"grad_norm": 0.11006793829051122,
"learning_rate": 1.540998123322993e-05,
"loss": 0.4023,
"step": 2586
},
{
"epoch": 1.2788283277715982,
"grad_norm": 0.11062041831877233,
"learning_rate": 1.540670596671533e-05,
"loss": 0.4008,
"step": 2587
},
{
"epoch": 1.279322704239278,
"grad_norm": 0.11261374081478946,
"learning_rate": 1.540342988041433e-05,
"loss": 0.4087,
"step": 2588
},
{
"epoch": 1.2798170807069584,
"grad_norm": 0.1178782627754105,
"learning_rate": 1.5400152974823653e-05,
"loss": 0.4048,
"step": 2589
},
{
"epoch": 1.2803114571746386,
"grad_norm": 0.12283632612134071,
"learning_rate": 1.5396875250440168e-05,
"loss": 0.405,
"step": 2590
},
{
"epoch": 1.2808058336423187,
"grad_norm": 0.11932186328319785,
"learning_rate": 1.539359670776084e-05,
"loss": 0.4051,
"step": 2591
},
{
"epoch": 1.2813002101099988,
"grad_norm": 0.2597891368089595,
"learning_rate": 1.5390317347282787e-05,
"loss": 0.402,
"step": 2592
},
{
"epoch": 1.281794586577679,
"grad_norm": 0.1106842527772952,
"learning_rate": 1.5387037169503237e-05,
"loss": 0.3954,
"step": 2593
},
{
"epoch": 1.282288963045359,
"grad_norm": 0.12343413830813657,
"learning_rate": 1.538375617491954e-05,
"loss": 0.4161,
"step": 2594
},
{
"epoch": 1.2827833395130392,
"grad_norm": 0.11051916843812927,
"learning_rate": 1.5380474364029175e-05,
"loss": 0.3996,
"step": 2595
},
{
"epoch": 1.2832777159807194,
"grad_norm": 0.11906408601665734,
"learning_rate": 1.5377191737329744e-05,
"loss": 0.4229,
"step": 2596
},
{
"epoch": 1.2837720924483995,
"grad_norm": 0.10921206721938953,
"learning_rate": 1.5373908295318973e-05,
"loss": 0.4122,
"step": 2597
},
{
"epoch": 1.2842664689160797,
"grad_norm": 0.11254531747121116,
"learning_rate": 1.537062403849471e-05,
"loss": 0.3979,
"step": 2598
},
{
"epoch": 1.2847608453837598,
"grad_norm": 0.11283804091910969,
"learning_rate": 1.5367338967354924e-05,
"loss": 0.4266,
"step": 2599
},
{
"epoch": 1.28525522185144,
"grad_norm": 0.11191057714777795,
"learning_rate": 1.5364053082397717e-05,
"loss": 0.432,
"step": 2600
},
{
"epoch": 1.28574959831912,
"grad_norm": 0.11305310318264832,
"learning_rate": 1.5360766384121304e-05,
"loss": 0.4164,
"step": 2601
},
{
"epoch": 1.2862439747868002,
"grad_norm": 0.17040693070116986,
"learning_rate": 1.5357478873024024e-05,
"loss": 0.3919,
"step": 2602
},
{
"epoch": 1.2867383512544803,
"grad_norm": 0.11666041234742591,
"learning_rate": 1.535419054960435e-05,
"loss": 0.4058,
"step": 2603
},
{
"epoch": 1.2872327277221605,
"grad_norm": 0.1106433381690211,
"learning_rate": 1.535090141436087e-05,
"loss": 0.405,
"step": 2604
},
{
"epoch": 1.2877271041898406,
"grad_norm": 0.11121975949520675,
"learning_rate": 1.5347611467792284e-05,
"loss": 0.4065,
"step": 2605
},
{
"epoch": 1.2882214806575207,
"grad_norm": 0.11600516386665975,
"learning_rate": 1.5344320710397442e-05,
"loss": 0.3994,
"step": 2606
},
{
"epoch": 1.2887158571252009,
"grad_norm": 0.1159718500217669,
"learning_rate": 1.5341029142675297e-05,
"loss": 0.4232,
"step": 2607
},
{
"epoch": 1.289210233592881,
"grad_norm": 0.12233525631519639,
"learning_rate": 1.5337736765124925e-05,
"loss": 0.4262,
"step": 2608
},
{
"epoch": 1.2897046100605611,
"grad_norm": 0.10700999332932709,
"learning_rate": 1.5334443578245535e-05,
"loss": 0.4244,
"step": 2609
},
{
"epoch": 1.2901989865282413,
"grad_norm": 0.12748929937042078,
"learning_rate": 1.5331149582536447e-05,
"loss": 0.4214,
"step": 2610
},
{
"epoch": 1.2906933629959214,
"grad_norm": 0.11192651490548293,
"learning_rate": 1.532785477849712e-05,
"loss": 0.4015,
"step": 2611
},
{
"epoch": 1.2911877394636015,
"grad_norm": 0.11610149230036697,
"learning_rate": 1.5324559166627115e-05,
"loss": 0.4398,
"step": 2612
},
{
"epoch": 1.2916821159312817,
"grad_norm": 0.12891752920998498,
"learning_rate": 1.532126274742613e-05,
"loss": 0.4212,
"step": 2613
},
{
"epoch": 1.2921764923989618,
"grad_norm": 0.1151806254727642,
"learning_rate": 1.5317965521393982e-05,
"loss": 0.4332,
"step": 2614
},
{
"epoch": 1.292670868866642,
"grad_norm": 0.12493010180966219,
"learning_rate": 1.531466748903061e-05,
"loss": 0.3882,
"step": 2615
},
{
"epoch": 1.293165245334322,
"grad_norm": 0.12010201610922902,
"learning_rate": 1.5311368650836077e-05,
"loss": 0.404,
"step": 2616
},
{
"epoch": 1.2936596218020022,
"grad_norm": 0.1270449672597615,
"learning_rate": 1.5308069007310557e-05,
"loss": 0.3924,
"step": 2617
},
{
"epoch": 1.2941539982696824,
"grad_norm": 0.11541885596322543,
"learning_rate": 1.530476855895436e-05,
"loss": 0.4404,
"step": 2618
},
{
"epoch": 1.2946483747373625,
"grad_norm": 0.10704544622643151,
"learning_rate": 1.530146730626792e-05,
"loss": 0.4143,
"step": 2619
},
{
"epoch": 1.2951427512050426,
"grad_norm": 0.12349763524074654,
"learning_rate": 1.5298165249751777e-05,
"loss": 0.4153,
"step": 2620
},
{
"epoch": 1.2956371276727228,
"grad_norm": 0.11557721933830965,
"learning_rate": 1.5294862389906607e-05,
"loss": 0.4031,
"step": 2621
},
{
"epoch": 1.296131504140403,
"grad_norm": 0.11218372510403529,
"learning_rate": 1.5291558727233198e-05,
"loss": 0.4017,
"step": 2622
},
{
"epoch": 1.296625880608083,
"grad_norm": 0.11236858652449949,
"learning_rate": 1.5288254262232474e-05,
"loss": 0.4184,
"step": 2623
},
{
"epoch": 1.2971202570757632,
"grad_norm": 0.1269078258645191,
"learning_rate": 1.5284948995405457e-05,
"loss": 0.4126,
"step": 2624
},
{
"epoch": 1.2976146335434433,
"grad_norm": 0.11428784112022755,
"learning_rate": 1.5281642927253318e-05,
"loss": 0.3876,
"step": 2625
},
{
"epoch": 1.2981090100111234,
"grad_norm": 0.1352750808183127,
"learning_rate": 1.527833605827733e-05,
"loss": 0.4019,
"step": 2626
},
{
"epoch": 1.2986033864788036,
"grad_norm": 0.1141074083395388,
"learning_rate": 1.5275028388978897e-05,
"loss": 0.4192,
"step": 2627
},
{
"epoch": 1.2990977629464837,
"grad_norm": 0.11797040187865367,
"learning_rate": 1.5271719919859536e-05,
"loss": 0.3957,
"step": 2628
},
{
"epoch": 1.2995921394141638,
"grad_norm": 0.11720148980684872,
"learning_rate": 1.526841065142089e-05,
"loss": 0.4129,
"step": 2629
},
{
"epoch": 1.300086515881844,
"grad_norm": 0.12444710655437867,
"learning_rate": 1.5265100584164733e-05,
"loss": 0.4167,
"step": 2630
},
{
"epoch": 1.3005808923495241,
"grad_norm": 0.12039204173411751,
"learning_rate": 1.5261789718592944e-05,
"loss": 0.4341,
"step": 2631
},
{
"epoch": 1.3010752688172043,
"grad_norm": 0.1135592319012706,
"learning_rate": 1.5258478055207527e-05,
"loss": 0.3895,
"step": 2632
},
{
"epoch": 1.3015696452848844,
"grad_norm": 0.10926450453701397,
"learning_rate": 1.5255165594510615e-05,
"loss": 0.4168,
"step": 2633
},
{
"epoch": 1.3020640217525645,
"grad_norm": 0.11903830281355515,
"learning_rate": 1.5251852337004454e-05,
"loss": 0.4426,
"step": 2634
},
{
"epoch": 1.3025583982202447,
"grad_norm": 0.12090955815850278,
"learning_rate": 1.5248538283191409e-05,
"loss": 0.4356,
"step": 2635
},
{
"epoch": 1.3030527746879248,
"grad_norm": 0.11985166684678672,
"learning_rate": 1.524522343357398e-05,
"loss": 0.4019,
"step": 2636
},
{
"epoch": 1.303547151155605,
"grad_norm": 0.13771270784660505,
"learning_rate": 1.524190778865477e-05,
"loss": 0.3647,
"step": 2637
},
{
"epoch": 1.304041527623285,
"grad_norm": 0.10746491993052895,
"learning_rate": 1.5238591348936516e-05,
"loss": 0.3916,
"step": 2638
},
{
"epoch": 1.3045359040909652,
"grad_norm": 0.11682736247781945,
"learning_rate": 1.5235274114922063e-05,
"loss": 0.4494,
"step": 2639
},
{
"epoch": 1.3050302805586453,
"grad_norm": 0.3493847562275763,
"learning_rate": 1.523195608711439e-05,
"loss": 0.4076,
"step": 2640
},
{
"epoch": 1.3055246570263255,
"grad_norm": 0.11322311771223255,
"learning_rate": 1.5228637266016585e-05,
"loss": 0.3959,
"step": 2641
},
{
"epoch": 1.3060190334940056,
"grad_norm": 0.11048716206921652,
"learning_rate": 1.5225317652131865e-05,
"loss": 0.4108,
"step": 2642
},
{
"epoch": 1.3065134099616857,
"grad_norm": 0.11967618178272206,
"learning_rate": 1.522199724596356e-05,
"loss": 0.4011,
"step": 2643
},
{
"epoch": 1.3070077864293659,
"grad_norm": 0.11805726302397829,
"learning_rate": 1.5218676048015125e-05,
"loss": 0.3872,
"step": 2644
},
{
"epoch": 1.307502162897046,
"grad_norm": 0.10474230805640848,
"learning_rate": 1.5215354058790128e-05,
"loss": 0.4028,
"step": 2645
},
{
"epoch": 1.3079965393647264,
"grad_norm": 0.11096706560662975,
"learning_rate": 1.5212031278792273e-05,
"loss": 0.4159,
"step": 2646
},
{
"epoch": 1.3084909158324063,
"grad_norm": 0.13924745400206984,
"learning_rate": 1.520870770852536e-05,
"loss": 0.4253,
"step": 2647
},
{
"epoch": 1.3089852923000866,
"grad_norm": 0.11152766941819511,
"learning_rate": 1.5205383348493334e-05,
"loss": 0.4261,
"step": 2648
},
{
"epoch": 1.3094796687677666,
"grad_norm": 0.11877463883217979,
"learning_rate": 1.5202058199200243e-05,
"loss": 0.4374,
"step": 2649
},
{
"epoch": 1.309974045235447,
"grad_norm": 0.11530280172267778,
"learning_rate": 1.5198732261150258e-05,
"loss": 0.4175,
"step": 2650
},
{
"epoch": 1.3104684217031268,
"grad_norm": 0.11695263525231293,
"learning_rate": 1.519540553484767e-05,
"loss": 0.4131,
"step": 2651
},
{
"epoch": 1.3109627981708072,
"grad_norm": 0.10758185090789703,
"learning_rate": 1.5192078020796896e-05,
"loss": 0.382,
"step": 2652
},
{
"epoch": 1.311457174638487,
"grad_norm": 0.11015366585264443,
"learning_rate": 1.5188749719502462e-05,
"loss": 0.4032,
"step": 2653
},
{
"epoch": 1.3119515511061675,
"grad_norm": 0.11238951879763605,
"learning_rate": 1.5185420631469022e-05,
"loss": 0.4146,
"step": 2654
},
{
"epoch": 1.3124459275738474,
"grad_norm": 0.2805228899024833,
"learning_rate": 1.518209075720134e-05,
"loss": 0.3986,
"step": 2655
},
{
"epoch": 1.3129403040415277,
"grad_norm": 0.10937540006092794,
"learning_rate": 1.5178760097204315e-05,
"loss": 0.3967,
"step": 2656
},
{
"epoch": 1.3134346805092076,
"grad_norm": 0.10448257199690417,
"learning_rate": 1.5175428651982942e-05,
"loss": 0.4157,
"step": 2657
},
{
"epoch": 1.313929056976888,
"grad_norm": 0.11276222390030465,
"learning_rate": 1.517209642204236e-05,
"loss": 0.4245,
"step": 2658
},
{
"epoch": 1.314423433444568,
"grad_norm": 0.1080246655779798,
"learning_rate": 1.5168763407887808e-05,
"loss": 0.407,
"step": 2659
},
{
"epoch": 1.3149178099122483,
"grad_norm": 0.10523091831520913,
"learning_rate": 1.5165429610024651e-05,
"loss": 0.4133,
"step": 2660
},
{
"epoch": 1.3154121863799282,
"grad_norm": 0.10747319431680052,
"learning_rate": 1.5162095028958377e-05,
"loss": 0.3977,
"step": 2661
},
{
"epoch": 1.3159065628476085,
"grad_norm": 0.11332184240465192,
"learning_rate": 1.5158759665194585e-05,
"loss": 0.4213,
"step": 2662
},
{
"epoch": 1.3164009393152885,
"grad_norm": 0.11619273136303294,
"learning_rate": 1.5155423519238998e-05,
"loss": 0.3985,
"step": 2663
},
{
"epoch": 1.3168953157829688,
"grad_norm": 0.10642677258021765,
"learning_rate": 1.5152086591597455e-05,
"loss": 0.4165,
"step": 2664
},
{
"epoch": 1.317389692250649,
"grad_norm": 0.11035457549647,
"learning_rate": 1.5148748882775914e-05,
"loss": 0.4019,
"step": 2665
},
{
"epoch": 1.317884068718329,
"grad_norm": 0.1095769390959713,
"learning_rate": 1.5145410393280453e-05,
"loss": 0.4031,
"step": 2666
},
{
"epoch": 1.3183784451860092,
"grad_norm": 0.10568289630359663,
"learning_rate": 1.5142071123617262e-05,
"loss": 0.4888,
"step": 2667
},
{
"epoch": 1.3188728216536894,
"grad_norm": 0.27265884425009224,
"learning_rate": 1.5138731074292663e-05,
"loss": 0.4214,
"step": 2668
},
{
"epoch": 1.3193671981213695,
"grad_norm": 0.12482168666738186,
"learning_rate": 1.5135390245813085e-05,
"loss": 0.4454,
"step": 2669
},
{
"epoch": 1.3198615745890496,
"grad_norm": 0.10764193978851198,
"learning_rate": 1.5132048638685073e-05,
"loss": 0.4156,
"step": 2670
},
{
"epoch": 1.3203559510567298,
"grad_norm": 0.10481487066188562,
"learning_rate": 1.51287062534153e-05,
"loss": 0.4165,
"step": 2671
},
{
"epoch": 1.32085032752441,
"grad_norm": 0.11752708099228938,
"learning_rate": 1.5125363090510549e-05,
"loss": 0.4224,
"step": 2672
},
{
"epoch": 1.32134470399209,
"grad_norm": 0.11033961585953489,
"learning_rate": 1.5122019150477724e-05,
"loss": 0.3944,
"step": 2673
},
{
"epoch": 1.3218390804597702,
"grad_norm": 0.11272406293364096,
"learning_rate": 1.5118674433823848e-05,
"loss": 0.405,
"step": 2674
},
{
"epoch": 1.3223334569274503,
"grad_norm": 0.1125918592121383,
"learning_rate": 1.511532894105606e-05,
"loss": 0.3834,
"step": 2675
},
{
"epoch": 1.3228278333951304,
"grad_norm": 0.11295550901107147,
"learning_rate": 1.5111982672681618e-05,
"loss": 0.4059,
"step": 2676
},
{
"epoch": 1.3233222098628106,
"grad_norm": 0.10401610251682596,
"learning_rate": 1.5108635629207893e-05,
"loss": 0.4331,
"step": 2677
},
{
"epoch": 1.3238165863304907,
"grad_norm": 0.10947828858058449,
"learning_rate": 1.5105287811142381e-05,
"loss": 0.4016,
"step": 2678
},
{
"epoch": 1.3243109627981708,
"grad_norm": 0.10971737289498616,
"learning_rate": 1.5101939218992688e-05,
"loss": 0.4286,
"step": 2679
},
{
"epoch": 1.324805339265851,
"grad_norm": 0.115499437903447,
"learning_rate": 1.5098589853266545e-05,
"loss": 0.4254,
"step": 2680
},
{
"epoch": 1.3252997157335311,
"grad_norm": 0.10762508241929146,
"learning_rate": 1.509523971447179e-05,
"loss": 0.428,
"step": 2681
},
{
"epoch": 1.3257940922012112,
"grad_norm": 0.10648483973986991,
"learning_rate": 1.5091888803116392e-05,
"loss": 0.4105,
"step": 2682
},
{
"epoch": 1.3262884686688914,
"grad_norm": 0.10707637618480699,
"learning_rate": 1.5088537119708426e-05,
"loss": 0.4026,
"step": 2683
},
{
"epoch": 1.3267828451365715,
"grad_norm": 0.10902863032495971,
"learning_rate": 1.5085184664756087e-05,
"loss": 0.425,
"step": 2684
},
{
"epoch": 1.3272772216042517,
"grad_norm": 0.11382272326396216,
"learning_rate": 1.5081831438767691e-05,
"loss": 0.3764,
"step": 2685
},
{
"epoch": 1.3277715980719318,
"grad_norm": 0.10622675369873154,
"learning_rate": 1.5078477442251665e-05,
"loss": 0.3964,
"step": 2686
},
{
"epoch": 1.328265974539612,
"grad_norm": 0.11208366208451291,
"learning_rate": 1.5075122675716548e-05,
"loss": 0.4255,
"step": 2687
},
{
"epoch": 1.328760351007292,
"grad_norm": 0.1131121995864864,
"learning_rate": 1.5071767139671018e-05,
"loss": 0.4197,
"step": 2688
},
{
"epoch": 1.3292547274749722,
"grad_norm": 0.11463456165933317,
"learning_rate": 1.5068410834623845e-05,
"loss": 0.4157,
"step": 2689
},
{
"epoch": 1.3297491039426523,
"grad_norm": 0.19918930693015566,
"learning_rate": 1.5065053761083927e-05,
"loss": 0.3799,
"step": 2690
},
{
"epoch": 1.3302434804103325,
"grad_norm": 0.12288696738011962,
"learning_rate": 1.5061695919560282e-05,
"loss": 0.4088,
"step": 2691
},
{
"epoch": 1.3307378568780126,
"grad_norm": 0.12050337137205067,
"learning_rate": 1.505833731056203e-05,
"loss": 0.4476,
"step": 2692
},
{
"epoch": 1.3312322333456927,
"grad_norm": 0.11578274742055278,
"learning_rate": 1.5054977934598425e-05,
"loss": 0.4152,
"step": 2693
},
{
"epoch": 1.3317266098133729,
"grad_norm": 0.11618108068018045,
"learning_rate": 1.5051617792178822e-05,
"loss": 0.4008,
"step": 2694
},
{
"epoch": 1.332220986281053,
"grad_norm": 0.12115391269862559,
"learning_rate": 1.5048256883812706e-05,
"loss": 0.4187,
"step": 2695
},
{
"epoch": 1.3327153627487331,
"grad_norm": 0.10487607197734826,
"learning_rate": 1.504489521000967e-05,
"loss": 0.3893,
"step": 2696
},
{
"epoch": 1.3332097392164133,
"grad_norm": 0.10760339837260187,
"learning_rate": 1.5041532771279422e-05,
"loss": 0.4164,
"step": 2697
},
{
"epoch": 1.3337041156840934,
"grad_norm": 0.11552394582055368,
"learning_rate": 1.5038169568131786e-05,
"loss": 0.3905,
"step": 2698
},
{
"epoch": 1.3341984921517736,
"grad_norm": 0.11278954443267343,
"learning_rate": 1.503480560107671e-05,
"loss": 0.4188,
"step": 2699
},
{
"epoch": 1.3346928686194537,
"grad_norm": 0.11353020276478129,
"learning_rate": 1.5031440870624247e-05,
"loss": 0.4134,
"step": 2700
},
{
"epoch": 1.3351872450871338,
"grad_norm": 0.11438379022089923,
"learning_rate": 1.5028075377284576e-05,
"loss": 0.4111,
"step": 2701
},
{
"epoch": 1.335681621554814,
"grad_norm": 0.11260795852235522,
"learning_rate": 1.5024709121567988e-05,
"loss": 0.395,
"step": 2702
},
{
"epoch": 1.336175998022494,
"grad_norm": 0.10690343914736161,
"learning_rate": 1.502134210398488e-05,
"loss": 0.4195,
"step": 2703
},
{
"epoch": 1.3366703744901742,
"grad_norm": 0.11527283656418859,
"learning_rate": 1.501797432504578e-05,
"loss": 0.4016,
"step": 2704
},
{
"epoch": 1.3371647509578544,
"grad_norm": 0.11335124690314863,
"learning_rate": 1.5014605785261318e-05,
"loss": 0.4111,
"step": 2705
},
{
"epoch": 1.3376591274255345,
"grad_norm": 0.11042174872557853,
"learning_rate": 1.5011236485142249e-05,
"loss": 0.3733,
"step": 2706
},
{
"epoch": 1.3381535038932146,
"grad_norm": 0.10682640730985106,
"learning_rate": 1.5007866425199443e-05,
"loss": 0.4275,
"step": 2707
},
{
"epoch": 1.3386478803608948,
"grad_norm": 0.11513661488330619,
"learning_rate": 1.5004495605943877e-05,
"loss": 0.4026,
"step": 2708
},
{
"epoch": 1.339142256828575,
"grad_norm": 0.11521333375038477,
"learning_rate": 1.5001124027886649e-05,
"loss": 0.4195,
"step": 2709
},
{
"epoch": 1.339636633296255,
"grad_norm": 0.11520257731685708,
"learning_rate": 1.499775169153897e-05,
"loss": 0.3917,
"step": 2710
},
{
"epoch": 1.3401310097639352,
"grad_norm": 0.1091473114375815,
"learning_rate": 1.4994378597412171e-05,
"loss": 0.4162,
"step": 2711
},
{
"epoch": 1.3406253862316153,
"grad_norm": 0.1121628365462019,
"learning_rate": 1.4991004746017692e-05,
"loss": 0.4147,
"step": 2712
},
{
"epoch": 1.3411197626992954,
"grad_norm": 0.11189214589763903,
"learning_rate": 1.4987630137867091e-05,
"loss": 0.395,
"step": 2713
},
{
"epoch": 1.3416141391669756,
"grad_norm": 0.10484892168533252,
"learning_rate": 1.4984254773472033e-05,
"loss": 0.4201,
"step": 2714
},
{
"epoch": 1.3421085156346557,
"grad_norm": 0.11530595459206384,
"learning_rate": 1.4980878653344318e-05,
"loss": 0.3823,
"step": 2715
},
{
"epoch": 1.3426028921023359,
"grad_norm": 0.11736969046406302,
"learning_rate": 1.4977501777995835e-05,
"loss": 0.4368,
"step": 2716
},
{
"epoch": 1.343097268570016,
"grad_norm": 0.1153202979119698,
"learning_rate": 1.49741241479386e-05,
"loss": 0.4118,
"step": 2717
},
{
"epoch": 1.3435916450376961,
"grad_norm": 0.11041532946207275,
"learning_rate": 1.4970745763684748e-05,
"loss": 0.4073,
"step": 2718
},
{
"epoch": 1.3440860215053765,
"grad_norm": 0.11549198398812172,
"learning_rate": 1.496736662574652e-05,
"loss": 0.4189,
"step": 2719
},
{
"epoch": 1.3445803979730564,
"grad_norm": 0.11094869245267547,
"learning_rate": 1.4963986734636277e-05,
"loss": 0.4204,
"step": 2720
},
{
"epoch": 1.3450747744407368,
"grad_norm": 0.10855007840341739,
"learning_rate": 1.4960606090866488e-05,
"loss": 0.4192,
"step": 2721
},
{
"epoch": 1.3455691509084167,
"grad_norm": 0.11624915705284228,
"learning_rate": 1.4957224694949744e-05,
"loss": 0.4039,
"step": 2722
},
{
"epoch": 1.346063527376097,
"grad_norm": 0.10994273963665416,
"learning_rate": 1.4953842547398743e-05,
"loss": 0.4219,
"step": 2723
},
{
"epoch": 1.346557903843777,
"grad_norm": 0.11345158107541654,
"learning_rate": 1.4950459648726298e-05,
"loss": 0.3925,
"step": 2724
},
{
"epoch": 1.3470522803114573,
"grad_norm": 0.11376958984493367,
"learning_rate": 1.4947075999445341e-05,
"loss": 0.4406,
"step": 2725
},
{
"epoch": 1.3475466567791372,
"grad_norm": 0.12324058340484667,
"learning_rate": 1.4943691600068912e-05,
"loss": 0.3733,
"step": 2726
},
{
"epoch": 1.3480410332468176,
"grad_norm": 0.10762443715240522,
"learning_rate": 1.494030645111017e-05,
"loss": 0.4079,
"step": 2727
},
{
"epoch": 1.3485354097144975,
"grad_norm": 0.11080738384904502,
"learning_rate": 1.4936920553082383e-05,
"loss": 0.4258,
"step": 2728
},
{
"epoch": 1.3490297861821778,
"grad_norm": 0.10581101081568146,
"learning_rate": 1.4933533906498937e-05,
"loss": 0.4106,
"step": 2729
},
{
"epoch": 1.3495241626498578,
"grad_norm": 0.10797422657936585,
"learning_rate": 1.4930146511873322e-05,
"loss": 0.3902,
"step": 2730
},
{
"epoch": 1.350018539117538,
"grad_norm": 0.10692922053092854,
"learning_rate": 1.4926758369719157e-05,
"loss": 0.4253,
"step": 2731
},
{
"epoch": 1.350512915585218,
"grad_norm": 0.12795495200929466,
"learning_rate": 1.492336948055016e-05,
"loss": 0.3892,
"step": 2732
},
{
"epoch": 1.3510072920528984,
"grad_norm": 0.1170703570461074,
"learning_rate": 1.4919979844880171e-05,
"loss": 0.397,
"step": 2733
},
{
"epoch": 1.3515016685205783,
"grad_norm": 0.11414113160957078,
"learning_rate": 1.4916589463223137e-05,
"loss": 0.4237,
"step": 2734
},
{
"epoch": 1.3519960449882586,
"grad_norm": 0.12782932763094587,
"learning_rate": 1.4913198336093125e-05,
"loss": 0.3889,
"step": 2735
},
{
"epoch": 1.3524904214559386,
"grad_norm": 0.10574975394586626,
"learning_rate": 1.4909806464004303e-05,
"loss": 0.4042,
"step": 2736
},
{
"epoch": 1.352984797923619,
"grad_norm": 0.10942255061532569,
"learning_rate": 1.4906413847470972e-05,
"loss": 0.4019,
"step": 2737
},
{
"epoch": 1.353479174391299,
"grad_norm": 0.11083227964706813,
"learning_rate": 1.4903020487007532e-05,
"loss": 0.4143,
"step": 2738
},
{
"epoch": 1.3539735508589792,
"grad_norm": 0.11162151706846195,
"learning_rate": 1.4899626383128487e-05,
"loss": 0.4151,
"step": 2739
},
{
"epoch": 1.3544679273266593,
"grad_norm": 0.11945235685267254,
"learning_rate": 1.4896231536348475e-05,
"loss": 0.428,
"step": 2740
},
{
"epoch": 1.3549623037943395,
"grad_norm": 0.11987601490677031,
"learning_rate": 1.4892835947182233e-05,
"loss": 0.4324,
"step": 2741
},
{
"epoch": 1.3554566802620196,
"grad_norm": 0.10671178700259294,
"learning_rate": 1.4889439616144617e-05,
"loss": 0.3932,
"step": 2742
},
{
"epoch": 1.3559510567296997,
"grad_norm": 0.10969818823170077,
"learning_rate": 1.4886042543750586e-05,
"loss": 0.3942,
"step": 2743
},
{
"epoch": 1.3564454331973799,
"grad_norm": 0.1139381593805736,
"learning_rate": 1.4882644730515223e-05,
"loss": 0.4447,
"step": 2744
},
{
"epoch": 1.35693980966506,
"grad_norm": 0.12031422817584642,
"learning_rate": 1.4879246176953715e-05,
"loss": 0.3966,
"step": 2745
},
{
"epoch": 1.3574341861327401,
"grad_norm": 0.10989369664353915,
"learning_rate": 1.4875846883581367e-05,
"loss": 0.3999,
"step": 2746
},
{
"epoch": 1.3579285626004203,
"grad_norm": 0.10864378065938537,
"learning_rate": 1.487244685091359e-05,
"loss": 0.3867,
"step": 2747
},
{
"epoch": 1.3584229390681004,
"grad_norm": 0.11036577747838187,
"learning_rate": 1.4869046079465914e-05,
"loss": 0.4038,
"step": 2748
},
{
"epoch": 1.3589173155357805,
"grad_norm": 0.1127526087879545,
"learning_rate": 1.4865644569753977e-05,
"loss": 0.4315,
"step": 2749
},
{
"epoch": 1.3594116920034607,
"grad_norm": 0.11665547619517735,
"learning_rate": 1.4862242322293525e-05,
"loss": 0.3999,
"step": 2750
},
{
"epoch": 1.3599060684711408,
"grad_norm": 0.11096233613419355,
"learning_rate": 1.485883933760043e-05,
"loss": 0.3855,
"step": 2751
},
{
"epoch": 1.360400444938821,
"grad_norm": 0.587617905998673,
"learning_rate": 1.4855435616190654e-05,
"loss": 0.4006,
"step": 2752
},
{
"epoch": 1.360894821406501,
"grad_norm": 0.11729910358832456,
"learning_rate": 1.4852031158580293e-05,
"loss": 0.4271,
"step": 2753
},
{
"epoch": 1.3613891978741812,
"grad_norm": 0.29645183653758733,
"learning_rate": 1.4848625965285542e-05,
"loss": 0.399,
"step": 2754
},
{
"epoch": 1.3618835743418614,
"grad_norm": 0.1262895334894169,
"learning_rate": 1.4845220036822705e-05,
"loss": 0.4054,
"step": 2755
},
{
"epoch": 1.3623779508095415,
"grad_norm": 0.11881126138982001,
"learning_rate": 1.4841813373708207e-05,
"loss": 0.4093,
"step": 2756
},
{
"epoch": 1.3628723272772216,
"grad_norm": 0.11866685881640857,
"learning_rate": 1.4838405976458581e-05,
"loss": 0.3904,
"step": 2757
},
{
"epoch": 1.3633667037449018,
"grad_norm": 0.2006373426175282,
"learning_rate": 1.4834997845590467e-05,
"loss": 0.4213,
"step": 2758
},
{
"epoch": 1.363861080212582,
"grad_norm": 0.12324045928459583,
"learning_rate": 1.4831588981620619e-05,
"loss": 0.388,
"step": 2759
},
{
"epoch": 1.364355456680262,
"grad_norm": 0.14835081874934108,
"learning_rate": 1.4828179385065907e-05,
"loss": 0.4276,
"step": 2760
},
{
"epoch": 1.3648498331479422,
"grad_norm": 0.1243460909361705,
"learning_rate": 1.4824769056443305e-05,
"loss": 0.4128,
"step": 2761
},
{
"epoch": 1.3653442096156223,
"grad_norm": 0.11262345832617907,
"learning_rate": 1.48213579962699e-05,
"loss": 0.4027,
"step": 2762
},
{
"epoch": 1.3658385860833024,
"grad_norm": 0.11496961250746753,
"learning_rate": 1.4817946205062887e-05,
"loss": 0.4285,
"step": 2763
},
{
"epoch": 1.3663329625509826,
"grad_norm": 0.1270097029597391,
"learning_rate": 1.4814533683339587e-05,
"loss": 0.397,
"step": 2764
},
{
"epoch": 1.3668273390186627,
"grad_norm": 0.1092487811859751,
"learning_rate": 1.481112043161741e-05,
"loss": 0.392,
"step": 2765
},
{
"epoch": 1.3673217154863428,
"grad_norm": 0.11738863939185439,
"learning_rate": 1.4807706450413885e-05,
"loss": 0.4055,
"step": 2766
},
{
"epoch": 1.367816091954023,
"grad_norm": 0.11923244501718698,
"learning_rate": 1.4804291740246665e-05,
"loss": 0.4013,
"step": 2767
},
{
"epoch": 1.3683104684217031,
"grad_norm": 0.10496762762876373,
"learning_rate": 1.4800876301633493e-05,
"loss": 0.4518,
"step": 2768
},
{
"epoch": 1.3688048448893833,
"grad_norm": 0.11816868453348596,
"learning_rate": 1.4797460135092232e-05,
"loss": 0.4339,
"step": 2769
},
{
"epoch": 1.3692992213570634,
"grad_norm": 0.12190756963792747,
"learning_rate": 1.4794043241140861e-05,
"loss": 0.4183,
"step": 2770
},
{
"epoch": 1.3697935978247435,
"grad_norm": 0.11104055079237951,
"learning_rate": 1.4790625620297454e-05,
"loss": 0.4171,
"step": 2771
},
{
"epoch": 1.3702879742924237,
"grad_norm": 0.11731225238393259,
"learning_rate": 1.4787207273080212e-05,
"loss": 0.4198,
"step": 2772
},
{
"epoch": 1.3707823507601038,
"grad_norm": 0.12177531773022533,
"learning_rate": 1.4783788200007436e-05,
"loss": 0.4112,
"step": 2773
},
{
"epoch": 1.371276727227784,
"grad_norm": 0.10835928493685673,
"learning_rate": 1.4780368401597539e-05,
"loss": 0.4242,
"step": 2774
},
{
"epoch": 1.371771103695464,
"grad_norm": 0.12852703588992428,
"learning_rate": 1.4776947878369044e-05,
"loss": 0.4318,
"step": 2775
},
{
"epoch": 1.3722654801631442,
"grad_norm": 0.3202818511820722,
"learning_rate": 1.4773526630840587e-05,
"loss": 0.4116,
"step": 2776
},
{
"epoch": 1.3727598566308243,
"grad_norm": 0.11409349668733959,
"learning_rate": 1.4770104659530905e-05,
"loss": 0.4128,
"step": 2777
},
{
"epoch": 1.3732542330985045,
"grad_norm": 0.1182935698307096,
"learning_rate": 1.476668196495886e-05,
"loss": 0.4145,
"step": 2778
},
{
"epoch": 1.3737486095661846,
"grad_norm": 0.12741765687980006,
"learning_rate": 1.4763258547643409e-05,
"loss": 0.4044,
"step": 2779
},
{
"epoch": 1.3742429860338647,
"grad_norm": 0.11233123320498495,
"learning_rate": 1.4759834408103623e-05,
"loss": 0.459,
"step": 2780
},
{
"epoch": 1.3747373625015449,
"grad_norm": 0.11693171743184896,
"learning_rate": 1.475640954685869e-05,
"loss": 0.4123,
"step": 2781
},
{
"epoch": 1.375231738969225,
"grad_norm": 0.11989595362422613,
"learning_rate": 1.4752983964427891e-05,
"loss": 0.4203,
"step": 2782
},
{
"epoch": 1.3757261154369052,
"grad_norm": 0.11308581557897958,
"learning_rate": 1.4749557661330637e-05,
"loss": 0.403,
"step": 2783
},
{
"epoch": 1.3762204919045853,
"grad_norm": 0.11842210712737376,
"learning_rate": 1.4746130638086436e-05,
"loss": 0.4059,
"step": 2784
},
{
"epoch": 1.3767148683722654,
"grad_norm": 0.1092613417357397,
"learning_rate": 1.47427028952149e-05,
"loss": 0.3797,
"step": 2785
},
{
"epoch": 1.3772092448399456,
"grad_norm": 0.11105594493777624,
"learning_rate": 1.4739274433235764e-05,
"loss": 0.4399,
"step": 2786
},
{
"epoch": 1.3777036213076257,
"grad_norm": 0.11273296933405744,
"learning_rate": 1.4735845252668863e-05,
"loss": 0.4152,
"step": 2787
},
{
"epoch": 1.3781979977753058,
"grad_norm": 0.11898792606751946,
"learning_rate": 1.473241535403414e-05,
"loss": 0.3971,
"step": 2788
},
{
"epoch": 1.378692374242986,
"grad_norm": 0.10521662679716581,
"learning_rate": 1.4728984737851658e-05,
"loss": 0.4053,
"step": 2789
},
{
"epoch": 1.379186750710666,
"grad_norm": 0.12569128517162714,
"learning_rate": 1.472555340464157e-05,
"loss": 0.4243,
"step": 2790
},
{
"epoch": 1.3796811271783462,
"grad_norm": 0.11478219450955242,
"learning_rate": 1.4722121354924157e-05,
"loss": 0.4008,
"step": 2791
},
{
"epoch": 1.3801755036460264,
"grad_norm": 0.1157061934327342,
"learning_rate": 1.4718688589219797e-05,
"loss": 0.4006,
"step": 2792
},
{
"epoch": 1.3806698801137065,
"grad_norm": 0.11614347268365213,
"learning_rate": 1.4715255108048978e-05,
"loss": 0.4043,
"step": 2793
},
{
"epoch": 1.3811642565813869,
"grad_norm": 0.11342161726878353,
"learning_rate": 1.4711820911932302e-05,
"loss": 0.4083,
"step": 2794
},
{
"epoch": 1.3816586330490668,
"grad_norm": 0.11492312930516928,
"learning_rate": 1.4708386001390475e-05,
"loss": 0.3973,
"step": 2795
},
{
"epoch": 1.3821530095167471,
"grad_norm": 0.11999395393274657,
"learning_rate": 1.4704950376944304e-05,
"loss": 0.3925,
"step": 2796
},
{
"epoch": 1.382647385984427,
"grad_norm": 0.11097863864491724,
"learning_rate": 1.4701514039114728e-05,
"loss": 0.3998,
"step": 2797
},
{
"epoch": 1.3831417624521074,
"grad_norm": 0.1176993116307161,
"learning_rate": 1.4698076988422765e-05,
"loss": 0.3941,
"step": 2798
},
{
"epoch": 1.3836361389197873,
"grad_norm": 0.1132917837864325,
"learning_rate": 1.4694639225389553e-05,
"loss": 0.4036,
"step": 2799
},
{
"epoch": 1.3841305153874677,
"grad_norm": 0.11887460688384771,
"learning_rate": 1.4691200750536351e-05,
"loss": 0.4134,
"step": 2800
},
{
"epoch": 1.3846248918551476,
"grad_norm": 0.11835961713542129,
"learning_rate": 1.4687761564384506e-05,
"loss": 0.421,
"step": 2801
},
{
"epoch": 1.385119268322828,
"grad_norm": 0.1224845354428046,
"learning_rate": 1.4684321667455483e-05,
"loss": 0.4028,
"step": 2802
},
{
"epoch": 1.3856136447905079,
"grad_norm": 0.7465596001051085,
"learning_rate": 1.4680881060270855e-05,
"loss": 0.4691,
"step": 2803
},
{
"epoch": 1.3861080212581882,
"grad_norm": 0.125651381728338,
"learning_rate": 1.4677439743352296e-05,
"loss": 0.4269,
"step": 2804
},
{
"epoch": 1.3866023977258681,
"grad_norm": 0.116639410897628,
"learning_rate": 1.4673997717221595e-05,
"loss": 0.3933,
"step": 2805
},
{
"epoch": 1.3870967741935485,
"grad_norm": 0.12063162582057589,
"learning_rate": 1.467055498240065e-05,
"loss": 0.3972,
"step": 2806
},
{
"epoch": 1.3875911506612284,
"grad_norm": 0.12530949339402062,
"learning_rate": 1.4667111539411454e-05,
"loss": 0.4164,
"step": 2807
},
{
"epoch": 1.3880855271289088,
"grad_norm": 0.11893697031744939,
"learning_rate": 1.4663667388776122e-05,
"loss": 0.4381,
"step": 2808
},
{
"epoch": 1.3885799035965887,
"grad_norm": 0.14121217036636585,
"learning_rate": 1.4660222531016865e-05,
"loss": 0.3892,
"step": 2809
},
{
"epoch": 1.389074280064269,
"grad_norm": 0.12586490396104205,
"learning_rate": 1.465677696665601e-05,
"loss": 0.3879,
"step": 2810
},
{
"epoch": 1.389568656531949,
"grad_norm": 0.11829035691698647,
"learning_rate": 1.4653330696215986e-05,
"loss": 0.4439,
"step": 2811
},
{
"epoch": 1.3900630329996293,
"grad_norm": 0.12367186707561684,
"learning_rate": 1.4649883720219329e-05,
"loss": 0.4105,
"step": 2812
},
{
"epoch": 1.3905574094673094,
"grad_norm": 0.12487047427242465,
"learning_rate": 1.464643603918869e-05,
"loss": 0.4122,
"step": 2813
},
{
"epoch": 1.3910517859349896,
"grad_norm": 0.11467330688726436,
"learning_rate": 1.4642987653646809e-05,
"loss": 0.4152,
"step": 2814
},
{
"epoch": 1.3915461624026697,
"grad_norm": 0.11874069069136393,
"learning_rate": 1.4639538564116552e-05,
"loss": 0.3948,
"step": 2815
},
{
"epoch": 1.3920405388703498,
"grad_norm": 0.11297289291486129,
"learning_rate": 1.4636088771120881e-05,
"loss": 0.4176,
"step": 2816
},
{
"epoch": 1.39253491533803,
"grad_norm": 0.11059744892183039,
"learning_rate": 1.463263827518287e-05,
"loss": 0.421,
"step": 2817
},
{
"epoch": 1.3930292918057101,
"grad_norm": 0.11765573501567624,
"learning_rate": 1.462918707682569e-05,
"loss": 0.4157,
"step": 2818
},
{
"epoch": 1.3935236682733902,
"grad_norm": 0.11661500984104699,
"learning_rate": 1.4625735176572633e-05,
"loss": 0.4,
"step": 2819
},
{
"epoch": 1.3940180447410704,
"grad_norm": 0.10729989911165451,
"learning_rate": 1.462228257494709e-05,
"loss": 0.3996,
"step": 2820
},
{
"epoch": 1.3945124212087505,
"grad_norm": 0.10974082943272646,
"learning_rate": 1.4618829272472553e-05,
"loss": 0.4095,
"step": 2821
},
{
"epoch": 1.3950067976764307,
"grad_norm": 0.11596441740719332,
"learning_rate": 1.461537526967263e-05,
"loss": 0.4178,
"step": 2822
},
{
"epoch": 1.3955011741441108,
"grad_norm": 0.10742209268236166,
"learning_rate": 1.4611920567071028e-05,
"loss": 0.3888,
"step": 2823
},
{
"epoch": 1.395995550611791,
"grad_norm": 0.11020309572297896,
"learning_rate": 1.4608465165191564e-05,
"loss": 0.4107,
"step": 2824
},
{
"epoch": 1.396489927079471,
"grad_norm": 0.109896706410616,
"learning_rate": 1.460500906455816e-05,
"loss": 0.3841,
"step": 2825
},
{
"epoch": 1.3969843035471512,
"grad_norm": 0.1057116554632528,
"learning_rate": 1.4601552265694843e-05,
"loss": 0.4192,
"step": 2826
},
{
"epoch": 1.3974786800148313,
"grad_norm": 0.11603019392290628,
"learning_rate": 1.4598094769125747e-05,
"loss": 0.3994,
"step": 2827
},
{
"epoch": 1.3979730564825115,
"grad_norm": 0.10776930545879025,
"learning_rate": 1.4594636575375115e-05,
"loss": 0.4134,
"step": 2828
},
{
"epoch": 1.3984674329501916,
"grad_norm": 0.10765856694873215,
"learning_rate": 1.4591177684967286e-05,
"loss": 0.4097,
"step": 2829
},
{
"epoch": 1.3989618094178717,
"grad_norm": 0.11113254437348995,
"learning_rate": 1.4587718098426713e-05,
"loss": 0.404,
"step": 2830
},
{
"epoch": 1.3994561858855519,
"grad_norm": 0.10609548140631643,
"learning_rate": 1.4584257816277951e-05,
"loss": 0.3916,
"step": 2831
},
{
"epoch": 1.399950562353232,
"grad_norm": 0.11633329512957764,
"learning_rate": 1.4580796839045667e-05,
"loss": 0.4349,
"step": 2832
},
{
"epoch": 1.4004449388209121,
"grad_norm": 0.10923015260261083,
"learning_rate": 1.4577335167254627e-05,
"loss": 0.4239,
"step": 2833
},
{
"epoch": 1.4009393152885923,
"grad_norm": 0.12507451133947337,
"learning_rate": 1.4573872801429701e-05,
"loss": 0.4106,
"step": 2834
},
{
"epoch": 1.4014336917562724,
"grad_norm": 0.11081421387002044,
"learning_rate": 1.4570409742095865e-05,
"loss": 0.3892,
"step": 2835
},
{
"epoch": 1.4019280682239526,
"grad_norm": 0.10699336401367554,
"learning_rate": 1.4566945989778207e-05,
"loss": 0.4174,
"step": 2836
},
{
"epoch": 1.4024224446916327,
"grad_norm": 0.12278171117998112,
"learning_rate": 1.456348154500191e-05,
"loss": 0.3907,
"step": 2837
},
{
"epoch": 1.4029168211593128,
"grad_norm": 0.11205667103365849,
"learning_rate": 1.456001640829227e-05,
"loss": 0.3951,
"step": 2838
},
{
"epoch": 1.403411197626993,
"grad_norm": 0.11811192508000119,
"learning_rate": 1.4556550580174684e-05,
"loss": 0.4051,
"step": 2839
},
{
"epoch": 1.403905574094673,
"grad_norm": 0.1297001649544432,
"learning_rate": 1.4553084061174657e-05,
"loss": 0.4485,
"step": 2840
},
{
"epoch": 1.4043999505623532,
"grad_norm": 0.11468292366071114,
"learning_rate": 1.4549616851817791e-05,
"loss": 0.41,
"step": 2841
},
{
"epoch": 1.4048943270300334,
"grad_norm": 0.10856083882305868,
"learning_rate": 1.4546148952629805e-05,
"loss": 0.4146,
"step": 2842
},
{
"epoch": 1.4053887034977135,
"grad_norm": 0.10986616181529268,
"learning_rate": 1.454268036413651e-05,
"loss": 0.4054,
"step": 2843
},
{
"epoch": 1.4058830799653936,
"grad_norm": 0.16640899554265443,
"learning_rate": 1.4539211086863832e-05,
"loss": 0.407,
"step": 2844
},
{
"epoch": 1.4063774564330738,
"grad_norm": 0.12307310949486826,
"learning_rate": 1.4535741121337789e-05,
"loss": 0.416,
"step": 2845
},
{
"epoch": 1.406871832900754,
"grad_norm": 0.11111386948652419,
"learning_rate": 1.4532270468084524e-05,
"loss": 0.4145,
"step": 2846
},
{
"epoch": 1.407366209368434,
"grad_norm": 0.12574083586802873,
"learning_rate": 1.452879912763026e-05,
"loss": 0.4241,
"step": 2847
},
{
"epoch": 1.4078605858361142,
"grad_norm": 0.11132093620555257,
"learning_rate": 1.4525327100501337e-05,
"loss": 0.4155,
"step": 2848
},
{
"epoch": 1.4083549623037943,
"grad_norm": 0.1271866963971104,
"learning_rate": 1.4521854387224205e-05,
"loss": 0.4097,
"step": 2849
},
{
"epoch": 1.4088493387714744,
"grad_norm": 0.12282008085432035,
"learning_rate": 1.4518380988325405e-05,
"loss": 0.4193,
"step": 2850
},
{
"epoch": 1.4093437152391546,
"grad_norm": 0.11741021736842171,
"learning_rate": 1.4514906904331582e-05,
"loss": 0.3931,
"step": 2851
},
{
"epoch": 1.4098380917068347,
"grad_norm": 0.11053690363453336,
"learning_rate": 1.4511432135769504e-05,
"loss": 0.4426,
"step": 2852
},
{
"epoch": 1.4103324681745149,
"grad_norm": 0.31316548499757746,
"learning_rate": 1.4507956683166018e-05,
"loss": 0.4037,
"step": 2853
},
{
"epoch": 1.410826844642195,
"grad_norm": 0.10661543603426658,
"learning_rate": 1.4504480547048092e-05,
"loss": 0.3794,
"step": 2854
},
{
"epoch": 1.4113212211098751,
"grad_norm": 0.12162862584549658,
"learning_rate": 1.450100372794279e-05,
"loss": 0.4349,
"step": 2855
},
{
"epoch": 1.4118155975775553,
"grad_norm": 0.10260504016737086,
"learning_rate": 1.4497526226377281e-05,
"loss": 0.4179,
"step": 2856
},
{
"epoch": 1.4123099740452354,
"grad_norm": 0.11813996824730387,
"learning_rate": 1.4494048042878839e-05,
"loss": 0.4162,
"step": 2857
},
{
"epoch": 1.4128043505129155,
"grad_norm": 0.1089754020551346,
"learning_rate": 1.4490569177974836e-05,
"loss": 0.4153,
"step": 2858
},
{
"epoch": 1.4132987269805957,
"grad_norm": 0.11128783194596363,
"learning_rate": 1.4487089632192756e-05,
"loss": 0.4055,
"step": 2859
},
{
"epoch": 1.4137931034482758,
"grad_norm": 0.11533754867013858,
"learning_rate": 1.4483609406060181e-05,
"loss": 0.407,
"step": 2860
},
{
"epoch": 1.414287479915956,
"grad_norm": 0.10955391582248099,
"learning_rate": 1.4480128500104795e-05,
"loss": 0.3839,
"step": 2861
},
{
"epoch": 1.414781856383636,
"grad_norm": 0.11097482328874847,
"learning_rate": 1.4476646914854388e-05,
"loss": 0.4141,
"step": 2862
},
{
"epoch": 1.4152762328513162,
"grad_norm": 0.11071364958960463,
"learning_rate": 1.447316465083685e-05,
"loss": 0.3965,
"step": 2863
},
{
"epoch": 1.4157706093189963,
"grad_norm": 0.1177194110117628,
"learning_rate": 1.4469681708580177e-05,
"loss": 0.3943,
"step": 2864
},
{
"epoch": 1.4162649857866765,
"grad_norm": 0.1086287834432703,
"learning_rate": 1.4466198088612469e-05,
"loss": 0.4274,
"step": 2865
},
{
"epoch": 1.4167593622543566,
"grad_norm": 0.11963713498072244,
"learning_rate": 1.4462713791461926e-05,
"loss": 0.4464,
"step": 2866
},
{
"epoch": 1.4172537387220367,
"grad_norm": 0.12230734277068044,
"learning_rate": 1.4459228817656847e-05,
"loss": 0.389,
"step": 2867
},
{
"epoch": 1.4177481151897169,
"grad_norm": 0.10498429008213403,
"learning_rate": 1.445574316772564e-05,
"loss": 0.4224,
"step": 2868
},
{
"epoch": 1.4182424916573972,
"grad_norm": 0.11511524355972982,
"learning_rate": 1.4452256842196816e-05,
"loss": 0.3998,
"step": 2869
},
{
"epoch": 1.4187368681250772,
"grad_norm": 0.11765877859041557,
"learning_rate": 1.4448769841598982e-05,
"loss": 0.4198,
"step": 2870
},
{
"epoch": 1.4192312445927575,
"grad_norm": 0.11781688335703294,
"learning_rate": 1.4445282166460852e-05,
"loss": 0.3985,
"step": 2871
},
{
"epoch": 1.4197256210604374,
"grad_norm": 0.10494841799321314,
"learning_rate": 1.444179381731124e-05,
"loss": 0.3947,
"step": 2872
},
{
"epoch": 1.4202199975281178,
"grad_norm": 0.1125400984801321,
"learning_rate": 1.4438304794679069e-05,
"loss": 0.4042,
"step": 2873
},
{
"epoch": 1.4207143739957977,
"grad_norm": 0.1029484717242306,
"learning_rate": 1.4434815099093352e-05,
"loss": 0.3747,
"step": 2874
},
{
"epoch": 1.421208750463478,
"grad_norm": 0.11142001797866657,
"learning_rate": 1.443132473108321e-05,
"loss": 0.4315,
"step": 2875
},
{
"epoch": 1.421703126931158,
"grad_norm": 0.11415059762505915,
"learning_rate": 1.4427833691177873e-05,
"loss": 0.4132,
"step": 2876
},
{
"epoch": 1.4221975033988383,
"grad_norm": 0.10884680093301133,
"learning_rate": 1.4424341979906662e-05,
"loss": 0.3713,
"step": 2877
},
{
"epoch": 1.4226918798665182,
"grad_norm": 0.1022135723505019,
"learning_rate": 1.4420849597799005e-05,
"loss": 0.399,
"step": 2878
},
{
"epoch": 1.4231862563341986,
"grad_norm": 0.11410820361660054,
"learning_rate": 1.4417356545384433e-05,
"loss": 0.4135,
"step": 2879
},
{
"epoch": 1.4236806328018785,
"grad_norm": 0.1206315070825568,
"learning_rate": 1.4413862823192575e-05,
"loss": 0.3827,
"step": 2880
},
{
"epoch": 1.4241750092695589,
"grad_norm": 0.1198722680494045,
"learning_rate": 1.4410368431753158e-05,
"loss": 0.4096,
"step": 2881
},
{
"epoch": 1.4246693857372388,
"grad_norm": 0.11504067646709962,
"learning_rate": 1.4406873371596026e-05,
"loss": 0.3935,
"step": 2882
},
{
"epoch": 1.4251637622049191,
"grad_norm": 0.12217698950783544,
"learning_rate": 1.4403377643251107e-05,
"loss": 0.4495,
"step": 2883
},
{
"epoch": 1.425658138672599,
"grad_norm": 0.44829519656777256,
"learning_rate": 1.4399881247248437e-05,
"loss": 0.4286,
"step": 2884
},
{
"epoch": 1.4261525151402794,
"grad_norm": 0.1280849940204612,
"learning_rate": 1.4396384184118159e-05,
"loss": 0.4072,
"step": 2885
},
{
"epoch": 1.4266468916079595,
"grad_norm": 0.11994099035031566,
"learning_rate": 1.4392886454390507e-05,
"loss": 0.4092,
"step": 2886
},
{
"epoch": 1.4271412680756397,
"grad_norm": 0.11947017309446185,
"learning_rate": 1.4389388058595822e-05,
"loss": 0.419,
"step": 2887
},
{
"epoch": 1.4276356445433198,
"grad_norm": 0.118208731980222,
"learning_rate": 1.4385888997264543e-05,
"loss": 0.4038,
"step": 2888
},
{
"epoch": 1.428130021011,
"grad_norm": 0.11303025208138046,
"learning_rate": 1.4382389270927215e-05,
"loss": 0.396,
"step": 2889
},
{
"epoch": 1.42862439747868,
"grad_norm": 0.11395671057295233,
"learning_rate": 1.437888888011448e-05,
"loss": 0.3879,
"step": 2890
},
{
"epoch": 1.4291187739463602,
"grad_norm": 0.1143881869896609,
"learning_rate": 1.4375387825357078e-05,
"loss": 0.4516,
"step": 2891
},
{
"epoch": 1.4296131504140404,
"grad_norm": 0.24650203746067786,
"learning_rate": 1.4371886107185857e-05,
"loss": 0.3786,
"step": 2892
},
{
"epoch": 1.4301075268817205,
"grad_norm": 0.10776228719189815,
"learning_rate": 1.436838372613176e-05,
"loss": 0.4151,
"step": 2893
},
{
"epoch": 1.4306019033494006,
"grad_norm": 0.11204954944058797,
"learning_rate": 1.436488068272583e-05,
"loss": 0.3993,
"step": 2894
},
{
"epoch": 1.4310962798170808,
"grad_norm": 0.12038505854506039,
"learning_rate": 1.4361376977499217e-05,
"loss": 0.4162,
"step": 2895
},
{
"epoch": 1.431590656284761,
"grad_norm": 0.11723820729287467,
"learning_rate": 1.4357872610983163e-05,
"loss": 0.4117,
"step": 2896
},
{
"epoch": 1.432085032752441,
"grad_norm": 0.10482343930762773,
"learning_rate": 1.4354367583709012e-05,
"loss": 0.421,
"step": 2897
},
{
"epoch": 1.4325794092201212,
"grad_norm": 0.1127345517049002,
"learning_rate": 1.435086189620822e-05,
"loss": 0.4035,
"step": 2898
},
{
"epoch": 1.4330737856878013,
"grad_norm": 0.11152534421000401,
"learning_rate": 1.434735554901232e-05,
"loss": 0.4225,
"step": 2899
},
{
"epoch": 1.4335681621554814,
"grad_norm": 0.11128402128620238,
"learning_rate": 1.4343848542652967e-05,
"loss": 0.4042,
"step": 2900
},
{
"epoch": 1.4340625386231616,
"grad_norm": 0.11221459151259064,
"learning_rate": 1.4340340877661908e-05,
"loss": 0.4028,
"step": 2901
},
{
"epoch": 1.4345569150908417,
"grad_norm": 0.11562996344665326,
"learning_rate": 1.4336832554570987e-05,
"loss": 0.3975,
"step": 2902
},
{
"epoch": 1.4350512915585218,
"grad_norm": 0.10653901633121628,
"learning_rate": 1.4333323573912146e-05,
"loss": 0.3881,
"step": 2903
},
{
"epoch": 1.435545668026202,
"grad_norm": 0.11390614737508101,
"learning_rate": 1.4329813936217436e-05,
"loss": 0.3971,
"step": 2904
},
{
"epoch": 1.4360400444938821,
"grad_norm": 0.11039481337645692,
"learning_rate": 1.4326303642019002e-05,
"loss": 0.4172,
"step": 2905
},
{
"epoch": 1.4365344209615623,
"grad_norm": 0.11294572529702746,
"learning_rate": 1.4322792691849087e-05,
"loss": 0.4052,
"step": 2906
},
{
"epoch": 1.4370287974292424,
"grad_norm": 0.11166977891523969,
"learning_rate": 1.4319281086240036e-05,
"loss": 0.3885,
"step": 2907
},
{
"epoch": 1.4375231738969225,
"grad_norm": 0.11336193020819145,
"learning_rate": 1.4315768825724292e-05,
"loss": 0.4062,
"step": 2908
},
{
"epoch": 1.4380175503646027,
"grad_norm": 0.10765535775395299,
"learning_rate": 1.43122559108344e-05,
"loss": 0.4351,
"step": 2909
},
{
"epoch": 1.4385119268322828,
"grad_norm": 0.11530304692062775,
"learning_rate": 1.4308742342103001e-05,
"loss": 0.4152,
"step": 2910
},
{
"epoch": 1.439006303299963,
"grad_norm": 0.10824849993678459,
"learning_rate": 1.4305228120062836e-05,
"loss": 0.42,
"step": 2911
},
{
"epoch": 1.439500679767643,
"grad_norm": 0.11203902580755444,
"learning_rate": 1.4301713245246744e-05,
"loss": 0.4246,
"step": 2912
},
{
"epoch": 1.4399950562353232,
"grad_norm": 0.10667057610025463,
"learning_rate": 1.4298197718187665e-05,
"loss": 0.3906,
"step": 2913
},
{
"epoch": 1.4404894327030033,
"grad_norm": 0.11196975703691253,
"learning_rate": 1.4294681539418642e-05,
"loss": 0.4062,
"step": 2914
},
{
"epoch": 1.4409838091706835,
"grad_norm": 0.11311614528553453,
"learning_rate": 1.4291164709472809e-05,
"loss": 0.4368,
"step": 2915
},
{
"epoch": 1.4414781856383636,
"grad_norm": 1.1721248579196228,
"learning_rate": 1.4287647228883397e-05,
"loss": 0.3886,
"step": 2916
},
{
"epoch": 1.4419725621060437,
"grad_norm": 0.10687271098987232,
"learning_rate": 1.4284129098183745e-05,
"loss": 0.4078,
"step": 2917
},
{
"epoch": 1.4424669385737239,
"grad_norm": 0.1151096986779402,
"learning_rate": 1.428061031790729e-05,
"loss": 0.3794,
"step": 2918
},
{
"epoch": 1.442961315041404,
"grad_norm": 0.11611404141889504,
"learning_rate": 1.4277090888587557e-05,
"loss": 0.4298,
"step": 2919
},
{
"epoch": 1.4434556915090841,
"grad_norm": 0.11419822112887222,
"learning_rate": 1.4273570810758176e-05,
"loss": 0.3939,
"step": 2920
},
{
"epoch": 1.4439500679767643,
"grad_norm": 0.11366840267978472,
"learning_rate": 1.4270050084952882e-05,
"loss": 0.4099,
"step": 2921
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.11045208787249548,
"learning_rate": 1.4266528711705496e-05,
"loss": 0.4166,
"step": 2922
},
{
"epoch": 1.4449388209121246,
"grad_norm": 0.1107361298386549,
"learning_rate": 1.4263006691549943e-05,
"loss": 0.4082,
"step": 2923
},
{
"epoch": 1.4454331973798047,
"grad_norm": 0.10644761860952608,
"learning_rate": 1.4259484025020248e-05,
"loss": 0.4202,
"step": 2924
},
{
"epoch": 1.4459275738474848,
"grad_norm": 0.11421718525895658,
"learning_rate": 1.4255960712650527e-05,
"loss": 0.392,
"step": 2925
},
{
"epoch": 1.446421950315165,
"grad_norm": 0.1112293811456956,
"learning_rate": 1.4252436754975005e-05,
"loss": 0.4062,
"step": 2926
},
{
"epoch": 1.446916326782845,
"grad_norm": 0.1086738223868148,
"learning_rate": 1.4248912152527994e-05,
"loss": 0.4099,
"step": 2927
},
{
"epoch": 1.4474107032505252,
"grad_norm": 0.113814390948515,
"learning_rate": 1.4245386905843914e-05,
"loss": 0.4066,
"step": 2928
},
{
"epoch": 1.4479050797182054,
"grad_norm": 0.1067237080463405,
"learning_rate": 1.4241861015457272e-05,
"loss": 0.4192,
"step": 2929
},
{
"epoch": 1.4483994561858855,
"grad_norm": 0.11316926160108683,
"learning_rate": 1.4238334481902673e-05,
"loss": 0.4015,
"step": 2930
},
{
"epoch": 1.4488938326535656,
"grad_norm": 0.1072938736470279,
"learning_rate": 1.423480730571484e-05,
"loss": 0.3825,
"step": 2931
},
{
"epoch": 1.4493882091212458,
"grad_norm": 0.11596075095953415,
"learning_rate": 1.423127948742856e-05,
"loss": 0.3945,
"step": 2932
},
{
"epoch": 1.449882585588926,
"grad_norm": 0.10565169762858023,
"learning_rate": 1.4227751027578743e-05,
"loss": 0.4367,
"step": 2933
},
{
"epoch": 1.450376962056606,
"grad_norm": 0.11677927234644302,
"learning_rate": 1.4224221926700388e-05,
"loss": 0.4155,
"step": 2934
},
{
"epoch": 1.4508713385242862,
"grad_norm": 0.11146702792998332,
"learning_rate": 1.4220692185328592e-05,
"loss": 0.4068,
"step": 2935
},
{
"epoch": 1.4513657149919663,
"grad_norm": 0.10366055402067666,
"learning_rate": 1.4217161803998549e-05,
"loss": 0.3942,
"step": 2936
},
{
"epoch": 1.4518600914596465,
"grad_norm": 0.10891757379013021,
"learning_rate": 1.4213630783245547e-05,
"loss": 0.4157,
"step": 2937
},
{
"epoch": 1.4523544679273266,
"grad_norm": 0.1132125955155006,
"learning_rate": 1.4210099123604974e-05,
"loss": 0.4035,
"step": 2938
},
{
"epoch": 1.4528488443950067,
"grad_norm": 0.11189188909629053,
"learning_rate": 1.4206566825612315e-05,
"loss": 0.4079,
"step": 2939
},
{
"epoch": 1.4533432208626869,
"grad_norm": 0.11172004443278795,
"learning_rate": 1.4203033889803152e-05,
"loss": 0.4164,
"step": 2940
},
{
"epoch": 1.453837597330367,
"grad_norm": 0.10804345689472573,
"learning_rate": 1.4199500316713161e-05,
"loss": 0.406,
"step": 2941
},
{
"epoch": 1.4543319737980474,
"grad_norm": 0.1083602594759408,
"learning_rate": 1.4195966106878119e-05,
"loss": 0.4334,
"step": 2942
},
{
"epoch": 1.4548263502657273,
"grad_norm": 0.11497362645237542,
"learning_rate": 1.4192431260833894e-05,
"loss": 0.4105,
"step": 2943
},
{
"epoch": 1.4553207267334076,
"grad_norm": 0.11415937897030252,
"learning_rate": 1.4188895779116457e-05,
"loss": 0.4329,
"step": 2944
},
{
"epoch": 1.4558151032010875,
"grad_norm": 0.17389712859622575,
"learning_rate": 1.4185359662261869e-05,
"loss": 0.4109,
"step": 2945
},
{
"epoch": 1.456309479668768,
"grad_norm": 0.1256960950504167,
"learning_rate": 1.4181822910806289e-05,
"loss": 0.4321,
"step": 2946
},
{
"epoch": 1.4568038561364478,
"grad_norm": 0.12585582944273188,
"learning_rate": 1.4178285525285978e-05,
"loss": 0.4274,
"step": 2947
},
{
"epoch": 1.4572982326041282,
"grad_norm": 0.11869063310025431,
"learning_rate": 1.4174747506237285e-05,
"loss": 0.4088,
"step": 2948
},
{
"epoch": 1.457792609071808,
"grad_norm": 0.1384135264498906,
"learning_rate": 1.4171208854196658e-05,
"loss": 0.4336,
"step": 2949
},
{
"epoch": 1.4582869855394884,
"grad_norm": 0.11400057387113918,
"learning_rate": 1.4167669569700645e-05,
"loss": 0.3965,
"step": 2950
},
{
"epoch": 1.4587813620071683,
"grad_norm": 0.11995299922338916,
"learning_rate": 1.416412965328589e-05,
"loss": 0.403,
"step": 2951
},
{
"epoch": 1.4592757384748487,
"grad_norm": 0.1145377192702129,
"learning_rate": 1.4160589105489117e-05,
"loss": 0.4067,
"step": 2952
},
{
"epoch": 1.4597701149425286,
"grad_norm": 0.1118083501058736,
"learning_rate": 1.415704792684717e-05,
"loss": 0.4082,
"step": 2953
},
{
"epoch": 1.460264491410209,
"grad_norm": 0.11977404753094022,
"learning_rate": 1.4153506117896968e-05,
"loss": 0.4095,
"step": 2954
},
{
"epoch": 1.460758867877889,
"grad_norm": 0.10898390534928408,
"learning_rate": 1.4149963679175541e-05,
"loss": 0.4171,
"step": 2955
},
{
"epoch": 1.4612532443455692,
"grad_norm": 0.1064322530112939,
"learning_rate": 1.4146420611220002e-05,
"loss": 0.4453,
"step": 2956
},
{
"epoch": 1.4617476208132492,
"grad_norm": 0.12036065099464745,
"learning_rate": 1.4142876914567571e-05,
"loss": 0.3963,
"step": 2957
},
{
"epoch": 1.4622419972809295,
"grad_norm": 0.10715964824209845,
"learning_rate": 1.4139332589755552e-05,
"loss": 0.4081,
"step": 2958
},
{
"epoch": 1.4627363737486094,
"grad_norm": 0.10639189261410946,
"learning_rate": 1.4135787637321354e-05,
"loss": 0.4018,
"step": 2959
},
{
"epoch": 1.4632307502162898,
"grad_norm": 0.1145668355124029,
"learning_rate": 1.4132242057802472e-05,
"loss": 0.3992,
"step": 2960
},
{
"epoch": 1.46372512668397,
"grad_norm": 0.12426008531997886,
"learning_rate": 1.412869585173651e-05,
"loss": 0.4074,
"step": 2961
},
{
"epoch": 1.46421950315165,
"grad_norm": 0.11248834937869256,
"learning_rate": 1.4125149019661146e-05,
"loss": 0.4174,
"step": 2962
},
{
"epoch": 1.4647138796193302,
"grad_norm": 0.10564043908393983,
"learning_rate": 1.4121601562114174e-05,
"loss": 0.41,
"step": 2963
},
{
"epoch": 1.4652082560870103,
"grad_norm": 0.11641730843960371,
"learning_rate": 1.4118053479633471e-05,
"loss": 0.393,
"step": 2964
},
{
"epoch": 1.4657026325546905,
"grad_norm": 0.23678745572141852,
"learning_rate": 1.4114504772757007e-05,
"loss": 0.44,
"step": 2965
},
{
"epoch": 1.4661970090223706,
"grad_norm": 0.11809342990666215,
"learning_rate": 1.411095544202286e-05,
"loss": 0.4171,
"step": 2966
},
{
"epoch": 1.4666913854900507,
"grad_norm": 0.11164971139536689,
"learning_rate": 1.4107405487969188e-05,
"loss": 0.4034,
"step": 2967
},
{
"epoch": 1.4671857619577309,
"grad_norm": 0.11417415447415827,
"learning_rate": 1.4103854911134247e-05,
"loss": 0.4564,
"step": 2968
},
{
"epoch": 1.467680138425411,
"grad_norm": 0.12450460073234565,
"learning_rate": 1.4100303712056395e-05,
"loss": 0.4159,
"step": 2969
},
{
"epoch": 1.4681745148930911,
"grad_norm": 0.11350103403932407,
"learning_rate": 1.4096751891274079e-05,
"loss": 0.4065,
"step": 2970
},
{
"epoch": 1.4686688913607713,
"grad_norm": 0.11978476033993743,
"learning_rate": 1.4093199449325837e-05,
"loss": 0.4054,
"step": 2971
},
{
"epoch": 1.4691632678284514,
"grad_norm": 0.12650373149130303,
"learning_rate": 1.4089646386750304e-05,
"loss": 0.4102,
"step": 2972
},
{
"epoch": 1.4696576442961315,
"grad_norm": 0.11443093208075576,
"learning_rate": 1.4086092704086212e-05,
"loss": 0.4183,
"step": 2973
},
{
"epoch": 1.4701520207638117,
"grad_norm": 0.11806754809939887,
"learning_rate": 1.4082538401872383e-05,
"loss": 0.3735,
"step": 2974
},
{
"epoch": 1.4706463972314918,
"grad_norm": 0.1141994912992687,
"learning_rate": 1.4078983480647738e-05,
"loss": 0.409,
"step": 2975
},
{
"epoch": 1.471140773699172,
"grad_norm": 0.12112991702206397,
"learning_rate": 1.4075427940951282e-05,
"loss": 0.3993,
"step": 2976
},
{
"epoch": 1.471635150166852,
"grad_norm": 0.11947086792964265,
"learning_rate": 1.4071871783322128e-05,
"loss": 0.4288,
"step": 2977
},
{
"epoch": 1.4721295266345322,
"grad_norm": 0.11514769627345331,
"learning_rate": 1.406831500829947e-05,
"loss": 0.4122,
"step": 2978
},
{
"epoch": 1.4726239031022124,
"grad_norm": 0.12272231342671681,
"learning_rate": 1.4064757616422597e-05,
"loss": 0.3911,
"step": 2979
},
{
"epoch": 1.4731182795698925,
"grad_norm": 0.11387802777402381,
"learning_rate": 1.4061199608230904e-05,
"loss": 0.4105,
"step": 2980
},
{
"epoch": 1.4736126560375726,
"grad_norm": 0.1213421408642651,
"learning_rate": 1.4057640984263865e-05,
"loss": 0.4144,
"step": 2981
},
{
"epoch": 1.4741070325052528,
"grad_norm": 0.12358434876731178,
"learning_rate": 1.405408174506105e-05,
"loss": 0.4362,
"step": 2982
},
{
"epoch": 1.474601408972933,
"grad_norm": 1.5326501011786684,
"learning_rate": 1.405052189116213e-05,
"loss": 0.4269,
"step": 2983
},
{
"epoch": 1.475095785440613,
"grad_norm": 0.13180027539735004,
"learning_rate": 1.4046961423106868e-05,
"loss": 0.4081,
"step": 2984
},
{
"epoch": 1.4755901619082932,
"grad_norm": 0.1089453320725611,
"learning_rate": 1.4043400341435105e-05,
"loss": 0.3987,
"step": 2985
},
{
"epoch": 1.4760845383759733,
"grad_norm": 0.11873661440515382,
"learning_rate": 1.4039838646686793e-05,
"loss": 0.3949,
"step": 2986
},
{
"epoch": 1.4765789148436534,
"grad_norm": 0.1278286936488067,
"learning_rate": 1.4036276339401976e-05,
"loss": 0.4164,
"step": 2987
},
{
"epoch": 1.4770732913113336,
"grad_norm": 0.11728440987682373,
"learning_rate": 1.4032713420120774e-05,
"loss": 0.4009,
"step": 2988
},
{
"epoch": 1.4775676677790137,
"grad_norm": 0.11553053321098335,
"learning_rate": 1.402914988938342e-05,
"loss": 0.4121,
"step": 2989
},
{
"epoch": 1.4780620442466939,
"grad_norm": 0.1203103607235583,
"learning_rate": 1.4025585747730226e-05,
"loss": 0.4351,
"step": 2990
},
{
"epoch": 1.478556420714374,
"grad_norm": 0.11853230087920537,
"learning_rate": 1.4022020995701602e-05,
"loss": 0.4082,
"step": 2991
},
{
"epoch": 1.4790507971820541,
"grad_norm": 0.11730854059191914,
"learning_rate": 1.4018455633838051e-05,
"loss": 0.4207,
"step": 2992
},
{
"epoch": 1.4795451736497343,
"grad_norm": 0.12095553571537349,
"learning_rate": 1.4014889662680169e-05,
"loss": 0.3822,
"step": 2993
},
{
"epoch": 1.4800395501174144,
"grad_norm": 0.11369260886077,
"learning_rate": 1.4011323082768638e-05,
"loss": 0.3939,
"step": 2994
},
{
"epoch": 1.4805339265850945,
"grad_norm": 0.11331592531332726,
"learning_rate": 1.4007755894644239e-05,
"loss": 0.4923,
"step": 2995
},
{
"epoch": 1.4810283030527747,
"grad_norm": 1.0984003015009571,
"learning_rate": 1.4004188098847848e-05,
"loss": 0.4046,
"step": 2996
},
{
"epoch": 1.4815226795204548,
"grad_norm": 0.11355962112684552,
"learning_rate": 1.4000619695920428e-05,
"loss": 0.4075,
"step": 2997
},
{
"epoch": 1.482017055988135,
"grad_norm": 0.11588799338000505,
"learning_rate": 1.3997050686403027e-05,
"loss": 0.4073,
"step": 2998
},
{
"epoch": 1.482511432455815,
"grad_norm": 0.12589476580674744,
"learning_rate": 1.3993481070836797e-05,
"loss": 0.3887,
"step": 2999
},
{
"epoch": 1.4830058089234952,
"grad_norm": 0.16060700611668016,
"learning_rate": 1.398991084976298e-05,
"loss": 0.4252,
"step": 3000
},
{
"epoch": 1.4835001853911753,
"grad_norm": 0.21817121237080428,
"learning_rate": 1.3986340023722902e-05,
"loss": 0.3939,
"step": 3001
},
{
"epoch": 1.4839945618588555,
"grad_norm": 0.3657537218739236,
"learning_rate": 1.3982768593257989e-05,
"loss": 0.4213,
"step": 3002
},
{
"epoch": 1.4844889383265356,
"grad_norm": 0.13348175661173958,
"learning_rate": 1.3979196558909758e-05,
"loss": 0.4059,
"step": 3003
},
{
"epoch": 1.4849833147942157,
"grad_norm": 0.13719134724595453,
"learning_rate": 1.397562392121981e-05,
"loss": 0.3763,
"step": 3004
},
{
"epoch": 1.4854776912618959,
"grad_norm": 0.11861251489966798,
"learning_rate": 1.3972050680729845e-05,
"loss": 0.4109,
"step": 3005
},
{
"epoch": 1.485972067729576,
"grad_norm": 0.13488819538784194,
"learning_rate": 1.3968476837981653e-05,
"loss": 0.4215,
"step": 3006
},
{
"epoch": 1.4864664441972562,
"grad_norm": 0.11976472414655091,
"learning_rate": 1.3964902393517112e-05,
"loss": 0.429,
"step": 3007
},
{
"epoch": 1.4869608206649363,
"grad_norm": 0.1299160454869938,
"learning_rate": 1.3961327347878197e-05,
"loss": 0.4246,
"step": 3008
},
{
"epoch": 1.4874551971326164,
"grad_norm": 0.11530996579450944,
"learning_rate": 1.3957751701606965e-05,
"loss": 0.4072,
"step": 3009
},
{
"epoch": 1.4879495736002966,
"grad_norm": 0.1265578371364316,
"learning_rate": 1.3954175455245578e-05,
"loss": 0.3762,
"step": 3010
},
{
"epoch": 1.4884439500679767,
"grad_norm": 0.11260653918160304,
"learning_rate": 1.3950598609336274e-05,
"loss": 0.404,
"step": 3011
},
{
"epoch": 1.4889383265356568,
"grad_norm": 0.11912470838000411,
"learning_rate": 1.3947021164421388e-05,
"loss": 0.4368,
"step": 3012
},
{
"epoch": 1.489432703003337,
"grad_norm": 0.1528313076039508,
"learning_rate": 1.3943443121043356e-05,
"loss": 0.4123,
"step": 3013
},
{
"epoch": 1.489927079471017,
"grad_norm": 0.11525985947727523,
"learning_rate": 1.3939864479744687e-05,
"loss": 0.4384,
"step": 3014
},
{
"epoch": 1.4904214559386972,
"grad_norm": 0.11291072181611926,
"learning_rate": 1.3936285241067985e-05,
"loss": 0.415,
"step": 3015
},
{
"epoch": 1.4909158324063774,
"grad_norm": 0.12609547312732863,
"learning_rate": 1.393270540555596e-05,
"loss": 0.3904,
"step": 3016
},
{
"epoch": 1.4914102088740577,
"grad_norm": 0.1109281298467323,
"learning_rate": 1.3929124973751393e-05,
"loss": 0.4081,
"step": 3017
},
{
"epoch": 1.4919045853417376,
"grad_norm": 0.12111693814650501,
"learning_rate": 1.3925543946197166e-05,
"loss": 0.3846,
"step": 3018
},
{
"epoch": 1.492398961809418,
"grad_norm": 0.11127923358879191,
"learning_rate": 1.3921962323436249e-05,
"loss": 0.4255,
"step": 3019
},
{
"epoch": 1.492893338277098,
"grad_norm": 0.11801454104955668,
"learning_rate": 1.39183801060117e-05,
"loss": 0.4165,
"step": 3020
},
{
"epoch": 1.4933877147447783,
"grad_norm": 0.11361293080986422,
"learning_rate": 1.391479729446667e-05,
"loss": 0.3951,
"step": 3021
},
{
"epoch": 1.4938820912124582,
"grad_norm": 0.10745929519007927,
"learning_rate": 1.3911213889344403e-05,
"loss": 0.3891,
"step": 3022
},
{
"epoch": 1.4943764676801385,
"grad_norm": 0.11346585364339809,
"learning_rate": 1.3907629891188224e-05,
"loss": 0.3873,
"step": 3023
},
{
"epoch": 1.4948708441478185,
"grad_norm": 0.11018583901420716,
"learning_rate": 1.3904045300541556e-05,
"loss": 0.4015,
"step": 3024
},
{
"epoch": 1.4953652206154988,
"grad_norm": 0.11256201308420574,
"learning_rate": 1.3900460117947905e-05,
"loss": 0.3827,
"step": 3025
},
{
"epoch": 1.4958595970831787,
"grad_norm": 0.10996959791082725,
"learning_rate": 1.3896874343950877e-05,
"loss": 0.3853,
"step": 3026
},
{
"epoch": 1.496353973550859,
"grad_norm": 0.11049687798428352,
"learning_rate": 1.3893287979094156e-05,
"loss": 0.3891,
"step": 3027
},
{
"epoch": 1.496848350018539,
"grad_norm": 0.11122757571235894,
"learning_rate": 1.3889701023921523e-05,
"loss": 0.3881,
"step": 3028
},
{
"epoch": 1.4973427264862194,
"grad_norm": 0.10770017398228586,
"learning_rate": 1.3886113478976848e-05,
"loss": 0.422,
"step": 3029
},
{
"epoch": 1.4978371029538993,
"grad_norm": 0.1118295469280641,
"learning_rate": 1.388252534480409e-05,
"loss": 0.4029,
"step": 3030
},
{
"epoch": 1.4983314794215796,
"grad_norm": 0.10764938070703242,
"learning_rate": 1.387893662194729e-05,
"loss": 0.4033,
"step": 3031
},
{
"epoch": 1.4988258558892595,
"grad_norm": 0.13010759922091858,
"learning_rate": 1.3875347310950588e-05,
"loss": 0.4384,
"step": 3032
},
{
"epoch": 1.49932023235694,
"grad_norm": 0.12026334298384601,
"learning_rate": 1.3871757412358213e-05,
"loss": 0.4314,
"step": 3033
},
{
"epoch": 1.4998146088246198,
"grad_norm": 0.11861796281966609,
"learning_rate": 1.3868166926714474e-05,
"loss": 0.3999,
"step": 3034
},
{
"epoch": 1.5003089852923002,
"grad_norm": 0.11619816669162683,
"learning_rate": 1.3864575854563783e-05,
"loss": 0.4143,
"step": 3035
},
{
"epoch": 1.50080336175998,
"grad_norm": 0.11605039681834961,
"learning_rate": 1.3860984196450621e-05,
"loss": 0.528,
"step": 3036
},
{
"epoch": 1.50080336175998,
"eval_loss": 0.49908125400543213,
"eval_runtime": 100.8986,
"eval_samples_per_second": 300.837,
"eval_steps_per_second": 37.612,
"step": 3036
},
{
"epoch": 1.5012977382276604,
"grad_norm": 0.6313809611151388,
"learning_rate": 1.3857391952919581e-05,
"loss": 0.4161,
"step": 3037
},
{
"epoch": 1.5017921146953404,
"grad_norm": 0.11137292177196309,
"learning_rate": 1.3853799124515326e-05,
"loss": 0.3875,
"step": 3038
},
{
"epoch": 1.5022864911630207,
"grad_norm": 0.11461183513375389,
"learning_rate": 1.3850205711782618e-05,
"loss": 0.4483,
"step": 3039
},
{
"epoch": 1.5027808676307006,
"grad_norm": 0.12046601393131068,
"learning_rate": 1.3846611715266305e-05,
"loss": 0.4086,
"step": 3040
},
{
"epoch": 1.503275244098381,
"grad_norm": 0.1210489008323169,
"learning_rate": 1.384301713551132e-05,
"loss": 0.4453,
"step": 3041
},
{
"epoch": 1.503769620566061,
"grad_norm": 0.11349317862943961,
"learning_rate": 1.383942197306269e-05,
"loss": 0.4141,
"step": 3042
},
{
"epoch": 1.5042639970337413,
"grad_norm": 0.18769661273845756,
"learning_rate": 1.3835826228465531e-05,
"loss": 0.4109,
"step": 3043
},
{
"epoch": 1.5047583735014214,
"grad_norm": 0.11763553305331662,
"learning_rate": 1.3832229902265039e-05,
"loss": 0.3693,
"step": 3044
},
{
"epoch": 1.5052527499691015,
"grad_norm": 0.11046969541629462,
"learning_rate": 1.3828632995006504e-05,
"loss": 0.4213,
"step": 3045
},
{
"epoch": 1.5057471264367817,
"grad_norm": 0.1153368740857275,
"learning_rate": 1.3825035507235307e-05,
"loss": 0.4152,
"step": 3046
},
{
"epoch": 1.5062415029044618,
"grad_norm": 0.11788981681894665,
"learning_rate": 1.382143743949691e-05,
"loss": 0.4068,
"step": 3047
},
{
"epoch": 1.506735879372142,
"grad_norm": 0.11965799562981502,
"learning_rate": 1.3817838792336866e-05,
"loss": 0.4119,
"step": 3048
},
{
"epoch": 1.507230255839822,
"grad_norm": 0.11390499771131034,
"learning_rate": 1.3814239566300822e-05,
"loss": 0.3953,
"step": 3049
},
{
"epoch": 1.5077246323075022,
"grad_norm": 0.11865674659525038,
"learning_rate": 1.38106397619345e-05,
"loss": 0.407,
"step": 3050
},
{
"epoch": 1.5082190087751823,
"grad_norm": 0.11982916141704493,
"learning_rate": 1.380703937978372e-05,
"loss": 0.4007,
"step": 3051
},
{
"epoch": 1.5087133852428625,
"grad_norm": 0.11631814036635767,
"learning_rate": 1.3803438420394386e-05,
"loss": 0.4061,
"step": 3052
},
{
"epoch": 1.5092077617105426,
"grad_norm": 0.1265447971308802,
"learning_rate": 1.3799836884312492e-05,
"loss": 0.4208,
"step": 3053
},
{
"epoch": 1.5097021381782227,
"grad_norm": 0.12315106828051682,
"learning_rate": 1.3796234772084114e-05,
"loss": 0.4135,
"step": 3054
},
{
"epoch": 1.5101965146459029,
"grad_norm": 0.1121226510815428,
"learning_rate": 1.3792632084255423e-05,
"loss": 0.395,
"step": 3055
},
{
"epoch": 1.510690891113583,
"grad_norm": 0.11450385773410972,
"learning_rate": 1.3789028821372665e-05,
"loss": 0.3802,
"step": 3056
},
{
"epoch": 1.5111852675812631,
"grad_norm": 0.1124996864675293,
"learning_rate": 1.378542498398219e-05,
"loss": 0.399,
"step": 3057
},
{
"epoch": 1.5116796440489433,
"grad_norm": 0.11668673383187805,
"learning_rate": 1.3781820572630417e-05,
"loss": 0.4325,
"step": 3058
},
{
"epoch": 1.5121740205166234,
"grad_norm": 0.11848906773884604,
"learning_rate": 1.3778215587863875e-05,
"loss": 0.4181,
"step": 3059
},
{
"epoch": 1.5126683969843036,
"grad_norm": 0.11148775658565852,
"learning_rate": 1.3774610030229152e-05,
"loss": 0.3936,
"step": 3060
},
{
"epoch": 1.5131627734519837,
"grad_norm": 0.1182804314916288,
"learning_rate": 1.3771003900272941e-05,
"loss": 0.4121,
"step": 3061
},
{
"epoch": 1.5136571499196638,
"grad_norm": 0.114672427061065,
"learning_rate": 1.3767397198542027e-05,
"loss": 0.4337,
"step": 3062
},
{
"epoch": 1.514151526387344,
"grad_norm": 0.11391345093208577,
"learning_rate": 1.3763789925583263e-05,
"loss": 0.3976,
"step": 3063
},
{
"epoch": 1.514645902855024,
"grad_norm": 0.10837296844569731,
"learning_rate": 1.3760182081943595e-05,
"loss": 0.435,
"step": 3064
},
{
"epoch": 1.5151402793227042,
"grad_norm": 0.11337037540978191,
"learning_rate": 1.375657366817007e-05,
"loss": 0.4282,
"step": 3065
},
{
"epoch": 1.5156346557903844,
"grad_norm": 0.11437987727634404,
"learning_rate": 1.3752964684809802e-05,
"loss": 0.4135,
"step": 3066
},
{
"epoch": 1.5161290322580645,
"grad_norm": 0.11360461950298938,
"learning_rate": 1.3749355132410002e-05,
"loss": 0.4291,
"step": 3067
},
{
"epoch": 1.5166234087257446,
"grad_norm": 0.1142738070923594,
"learning_rate": 1.3745745011517969e-05,
"loss": 0.379,
"step": 3068
},
{
"epoch": 1.5171177851934248,
"grad_norm": 0.11456773556488317,
"learning_rate": 1.3742134322681074e-05,
"loss": 0.4093,
"step": 3069
},
{
"epoch": 1.517612161661105,
"grad_norm": 0.12496428198148511,
"learning_rate": 1.3738523066446794e-05,
"loss": 0.4242,
"step": 3070
},
{
"epoch": 1.518106538128785,
"grad_norm": 0.11381404252212514,
"learning_rate": 1.3734911243362674e-05,
"loss": 0.4086,
"step": 3071
},
{
"epoch": 1.5186009145964652,
"grad_norm": 0.10503270743847272,
"learning_rate": 1.373129885397636e-05,
"loss": 0.4052,
"step": 3072
},
{
"epoch": 1.5190952910641453,
"grad_norm": 0.11078243236695988,
"learning_rate": 1.3727685898835574e-05,
"loss": 0.3937,
"step": 3073
},
{
"epoch": 1.5195896675318254,
"grad_norm": 0.1195616081706004,
"learning_rate": 1.3724072378488124e-05,
"loss": 0.404,
"step": 3074
},
{
"epoch": 1.5200840439995056,
"grad_norm": 0.10981433136000109,
"learning_rate": 1.372045829348191e-05,
"loss": 0.4476,
"step": 3075
},
{
"epoch": 1.5205784204671857,
"grad_norm": 0.1108571344537307,
"learning_rate": 1.3716843644364914e-05,
"loss": 0.4316,
"step": 3076
},
{
"epoch": 1.5210727969348659,
"grad_norm": 0.11873292088799006,
"learning_rate": 1.3713228431685201e-05,
"loss": 0.4124,
"step": 3077
},
{
"epoch": 1.521567173402546,
"grad_norm": 0.10870394196812977,
"learning_rate": 1.3709612655990928e-05,
"loss": 0.3872,
"step": 3078
},
{
"epoch": 1.5220615498702261,
"grad_norm": 0.10503218021553531,
"learning_rate": 1.3705996317830333e-05,
"loss": 0.4315,
"step": 3079
},
{
"epoch": 1.5225559263379063,
"grad_norm": 0.12084831304984092,
"learning_rate": 1.3702379417751734e-05,
"loss": 0.4025,
"step": 3080
},
{
"epoch": 1.5230503028055864,
"grad_norm": 0.11284648924203107,
"learning_rate": 1.3698761956303543e-05,
"loss": 0.4062,
"step": 3081
},
{
"epoch": 1.5235446792732668,
"grad_norm": 0.11709099848352039,
"learning_rate": 1.369514393403426e-05,
"loss": 0.3975,
"step": 3082
},
{
"epoch": 1.5240390557409467,
"grad_norm": 0.11238927269229194,
"learning_rate": 1.3691525351492452e-05,
"loss": 0.4077,
"step": 3083
},
{
"epoch": 1.524533432208627,
"grad_norm": 0.2546335480701346,
"learning_rate": 1.3687906209226794e-05,
"loss": 0.4015,
"step": 3084
},
{
"epoch": 1.525027808676307,
"grad_norm": 0.11758772151573564,
"learning_rate": 1.3684286507786029e-05,
"loss": 0.4148,
"step": 3085
},
{
"epoch": 1.5255221851439873,
"grad_norm": 0.10723925768349657,
"learning_rate": 1.3680666247718991e-05,
"loss": 0.4112,
"step": 3086
},
{
"epoch": 1.5260165616116672,
"grad_norm": 0.11070518011307778,
"learning_rate": 1.3677045429574602e-05,
"loss": 0.3997,
"step": 3087
},
{
"epoch": 1.5265109380793476,
"grad_norm": 0.1231862256208089,
"learning_rate": 1.3673424053901862e-05,
"loss": 0.4042,
"step": 3088
},
{
"epoch": 1.5270053145470275,
"grad_norm": 0.1114031834530504,
"learning_rate": 1.3669802121249857e-05,
"loss": 0.4145,
"step": 3089
},
{
"epoch": 1.5274996910147078,
"grad_norm": 0.11731402081328049,
"learning_rate": 1.3666179632167764e-05,
"loss": 0.4093,
"step": 3090
},
{
"epoch": 1.5279940674823878,
"grad_norm": 0.11008908270593804,
"learning_rate": 1.3662556587204832e-05,
"loss": 0.4191,
"step": 3091
},
{
"epoch": 1.528488443950068,
"grad_norm": 0.10853270474836299,
"learning_rate": 1.365893298691041e-05,
"loss": 0.4046,
"step": 3092
},
{
"epoch": 1.528982820417748,
"grad_norm": 0.11149140240188878,
"learning_rate": 1.3655308831833915e-05,
"loss": 0.4401,
"step": 3093
},
{
"epoch": 1.5294771968854284,
"grad_norm": 0.12003008774745683,
"learning_rate": 1.3651684122524857e-05,
"loss": 0.3923,
"step": 3094
},
{
"epoch": 1.5299715733531083,
"grad_norm": 0.11048854987285266,
"learning_rate": 1.3648058859532839e-05,
"loss": 0.426,
"step": 3095
},
{
"epoch": 1.5304659498207887,
"grad_norm": 0.1131050135771004,
"learning_rate": 1.3644433043407526e-05,
"loss": 0.3828,
"step": 3096
},
{
"epoch": 1.5309603262884686,
"grad_norm": 0.1254743675828906,
"learning_rate": 1.3640806674698681e-05,
"loss": 0.418,
"step": 3097
},
{
"epoch": 1.531454702756149,
"grad_norm": 0.11242735723714233,
"learning_rate": 1.3637179753956154e-05,
"loss": 0.4023,
"step": 3098
},
{
"epoch": 1.5319490792238288,
"grad_norm": 0.11654763068488853,
"learning_rate": 1.3633552281729866e-05,
"loss": 0.3984,
"step": 3099
},
{
"epoch": 1.5324434556915092,
"grad_norm": 0.12222022973161577,
"learning_rate": 1.3629924258569835e-05,
"loss": 0.4019,
"step": 3100
},
{
"epoch": 1.532937832159189,
"grad_norm": 0.10463522032735743,
"learning_rate": 1.3626295685026154e-05,
"loss": 0.4009,
"step": 3101
},
{
"epoch": 1.5334322086268695,
"grad_norm": 0.10605542649375482,
"learning_rate": 1.3622666561649004e-05,
"loss": 0.4031,
"step": 3102
},
{
"epoch": 1.5339265850945494,
"grad_norm": 0.1265058712810658,
"learning_rate": 1.3619036888988642e-05,
"loss": 0.3891,
"step": 3103
},
{
"epoch": 1.5344209615622297,
"grad_norm": 0.1109786094982626,
"learning_rate": 1.3615406667595417e-05,
"loss": 0.3726,
"step": 3104
},
{
"epoch": 1.5349153380299096,
"grad_norm": 0.11884198486640739,
"learning_rate": 1.3611775898019757e-05,
"loss": 0.412,
"step": 3105
},
{
"epoch": 1.53540971449759,
"grad_norm": 0.1085345735340863,
"learning_rate": 1.3608144580812176e-05,
"loss": 0.3915,
"step": 3106
},
{
"epoch": 1.53590409096527,
"grad_norm": 0.10799743372698864,
"learning_rate": 1.3604512716523262e-05,
"loss": 0.4178,
"step": 3107
},
{
"epoch": 1.5363984674329503,
"grad_norm": 0.1143228684170182,
"learning_rate": 1.3600880305703704e-05,
"loss": 0.4062,
"step": 3108
},
{
"epoch": 1.5368928439006302,
"grad_norm": 0.10709689798086638,
"learning_rate": 1.3597247348904253e-05,
"loss": 0.3882,
"step": 3109
},
{
"epoch": 1.5373872203683105,
"grad_norm": 0.10395316476530175,
"learning_rate": 1.3593613846675755e-05,
"loss": 0.4177,
"step": 3110
},
{
"epoch": 1.5378815968359905,
"grad_norm": 0.1465241285309284,
"learning_rate": 1.3589979799569137e-05,
"loss": 0.4021,
"step": 3111
},
{
"epoch": 1.5383759733036708,
"grad_norm": 0.1035481022442761,
"learning_rate": 1.3586345208135411e-05,
"loss": 0.398,
"step": 3112
},
{
"epoch": 1.5388703497713507,
"grad_norm": 0.11237333320371597,
"learning_rate": 1.3582710072925664e-05,
"loss": 0.4309,
"step": 3113
},
{
"epoch": 1.539364726239031,
"grad_norm": 0.1172003461914945,
"learning_rate": 1.357907439449107e-05,
"loss": 0.4508,
"step": 3114
},
{
"epoch": 1.539859102706711,
"grad_norm": 0.3926197047126389,
"learning_rate": 1.3575438173382888e-05,
"loss": 0.3967,
"step": 3115
},
{
"epoch": 1.5403534791743914,
"grad_norm": 0.11480849356146751,
"learning_rate": 1.3571801410152449e-05,
"loss": 0.3857,
"step": 3116
},
{
"epoch": 1.5408478556420713,
"grad_norm": 0.11342082090084649,
"learning_rate": 1.3568164105351185e-05,
"loss": 0.4396,
"step": 3117
},
{
"epoch": 1.5413422321097516,
"grad_norm": 0.13347810043905356,
"learning_rate": 1.356452625953059e-05,
"loss": 0.4245,
"step": 3118
},
{
"epoch": 1.5418366085774318,
"grad_norm": 0.12472056584805008,
"learning_rate": 1.3560887873242253e-05,
"loss": 0.4224,
"step": 3119
},
{
"epoch": 1.542330985045112,
"grad_norm": 0.12277469152731209,
"learning_rate": 1.3557248947037837e-05,
"loss": 0.4257,
"step": 3120
},
{
"epoch": 1.542825361512792,
"grad_norm": 0.11842180182375087,
"learning_rate": 1.3553609481469094e-05,
"loss": 0.3773,
"step": 3121
},
{
"epoch": 1.5433197379804722,
"grad_norm": 0.11384173608229066,
"learning_rate": 1.3549969477087853e-05,
"loss": 0.4059,
"step": 3122
},
{
"epoch": 1.5438141144481523,
"grad_norm": 0.1096752443805573,
"learning_rate": 1.3546328934446027e-05,
"loss": 0.4044,
"step": 3123
},
{
"epoch": 1.5443084909158324,
"grad_norm": 0.1273415124107835,
"learning_rate": 1.3542687854095604e-05,
"loss": 0.3954,
"step": 3124
},
{
"epoch": 1.5448028673835126,
"grad_norm": 0.11238429061006139,
"learning_rate": 1.3539046236588672e-05,
"loss": 0.4069,
"step": 3125
},
{
"epoch": 1.5452972438511927,
"grad_norm": 0.12105730484972055,
"learning_rate": 1.3535404082477375e-05,
"loss": 0.4049,
"step": 3126
},
{
"epoch": 1.5457916203188728,
"grad_norm": 0.1251646492362749,
"learning_rate": 1.3531761392313953e-05,
"loss": 0.4516,
"step": 3127
},
{
"epoch": 1.546285996786553,
"grad_norm": 0.11660952075838484,
"learning_rate": 1.3528118166650732e-05,
"loss": 0.3785,
"step": 3128
},
{
"epoch": 1.5467803732542331,
"grad_norm": 0.12090045919867062,
"learning_rate": 1.3524474406040105e-05,
"loss": 0.398,
"step": 3129
},
{
"epoch": 1.5472747497219133,
"grad_norm": 0.11120899679362056,
"learning_rate": 1.352083011103456e-05,
"loss": 0.4223,
"step": 3130
},
{
"epoch": 1.5477691261895934,
"grad_norm": 0.158604568358294,
"learning_rate": 1.3517185282186659e-05,
"loss": 0.4357,
"step": 3131
},
{
"epoch": 1.5482635026572735,
"grad_norm": 0.12274136500871871,
"learning_rate": 1.3513539920049034e-05,
"loss": 0.4315,
"step": 3132
},
{
"epoch": 1.5487578791249537,
"grad_norm": 0.11411466772037757,
"learning_rate": 1.3509894025174423e-05,
"loss": 0.3933,
"step": 3133
},
{
"epoch": 1.5492522555926338,
"grad_norm": 0.10720117391613321,
"learning_rate": 1.3506247598115629e-05,
"loss": 0.3921,
"step": 3134
},
{
"epoch": 1.549746632060314,
"grad_norm": 0.1152692665003032,
"learning_rate": 1.3502600639425535e-05,
"loss": 0.4355,
"step": 3135
},
{
"epoch": 1.550241008527994,
"grad_norm": 0.11545676097649347,
"learning_rate": 1.3498953149657105e-05,
"loss": 0.398,
"step": 3136
},
{
"epoch": 1.5507353849956742,
"grad_norm": 0.11146641718411135,
"learning_rate": 1.349530512936339e-05,
"loss": 0.4096,
"step": 3137
},
{
"epoch": 1.5512297614633543,
"grad_norm": 0.11908599525133702,
"learning_rate": 1.3491656579097518e-05,
"loss": 0.4438,
"step": 3138
},
{
"epoch": 1.5517241379310345,
"grad_norm": 0.10863307641032378,
"learning_rate": 1.3488007499412694e-05,
"loss": 0.3716,
"step": 3139
},
{
"epoch": 1.5522185143987146,
"grad_norm": 0.10475628631851268,
"learning_rate": 1.3484357890862203e-05,
"loss": 0.3823,
"step": 3140
},
{
"epoch": 1.5527128908663947,
"grad_norm": 0.10964847624838454,
"learning_rate": 1.3480707753999424e-05,
"loss": 0.4077,
"step": 3141
},
{
"epoch": 1.5532072673340749,
"grad_norm": 0.11502876528968266,
"learning_rate": 1.34770570893778e-05,
"loss": 0.4076,
"step": 3142
},
{
"epoch": 1.553701643801755,
"grad_norm": 0.11123947859763028,
"learning_rate": 1.347340589755085e-05,
"loss": 0.3796,
"step": 3143
},
{
"epoch": 1.5541960202694352,
"grad_norm": 0.11174700032565488,
"learning_rate": 1.3469754179072198e-05,
"loss": 0.4016,
"step": 3144
},
{
"epoch": 1.5546903967371153,
"grad_norm": 0.11349315755831145,
"learning_rate": 1.3466101934495522e-05,
"loss": 0.4135,
"step": 3145
},
{
"epoch": 1.5551847732047954,
"grad_norm": 0.10721735962599743,
"learning_rate": 1.3462449164374591e-05,
"loss": 0.3908,
"step": 3146
},
{
"epoch": 1.5556791496724756,
"grad_norm": 0.22336761070204042,
"learning_rate": 1.3458795869263258e-05,
"loss": 0.4235,
"step": 3147
},
{
"epoch": 1.5561735261401557,
"grad_norm": 0.1068439525027523,
"learning_rate": 1.3455142049715444e-05,
"loss": 0.4024,
"step": 3148
},
{
"epoch": 1.5566679026078358,
"grad_norm": 0.11721098982886691,
"learning_rate": 1.3451487706285158e-05,
"loss": 0.4273,
"step": 3149
},
{
"epoch": 1.557162279075516,
"grad_norm": 0.1192806097384965,
"learning_rate": 1.3447832839526488e-05,
"loss": 0.3976,
"step": 3150
},
{
"epoch": 1.557656655543196,
"grad_norm": 0.11436877514330146,
"learning_rate": 1.3444177449993598e-05,
"loss": 0.4528,
"step": 3151
},
{
"epoch": 1.5581510320108762,
"grad_norm": 0.11168237513273276,
"learning_rate": 1.3440521538240732e-05,
"loss": 0.4116,
"step": 3152
},
{
"epoch": 1.5586454084785564,
"grad_norm": 0.11197994930913506,
"learning_rate": 1.3436865104822217e-05,
"loss": 0.4267,
"step": 3153
},
{
"epoch": 1.5591397849462365,
"grad_norm": 0.11390845518089386,
"learning_rate": 1.3433208150292451e-05,
"loss": 0.4114,
"step": 3154
},
{
"epoch": 1.5596341614139169,
"grad_norm": 0.11149252396362204,
"learning_rate": 1.342955067520592e-05,
"loss": 0.3982,
"step": 3155
},
{
"epoch": 1.5601285378815968,
"grad_norm": 0.10871657435633789,
"learning_rate": 1.3425892680117185e-05,
"loss": 0.4173,
"step": 3156
},
{
"epoch": 1.5606229143492771,
"grad_norm": 0.11270606052203658,
"learning_rate": 1.3422234165580884e-05,
"loss": 0.4339,
"step": 3157
},
{
"epoch": 1.561117290816957,
"grad_norm": 0.12069338766980756,
"learning_rate": 1.3418575132151736e-05,
"loss": 0.4341,
"step": 3158
},
{
"epoch": 1.5616116672846374,
"grad_norm": 0.11127714888815304,
"learning_rate": 1.3414915580384538e-05,
"loss": 0.4102,
"step": 3159
},
{
"epoch": 1.5621060437523173,
"grad_norm": 0.11381352980393025,
"learning_rate": 1.3411255510834166e-05,
"loss": 0.4237,
"step": 3160
},
{
"epoch": 1.5626004202199977,
"grad_norm": 0.10771536696206481,
"learning_rate": 1.340759492405558e-05,
"loss": 0.4059,
"step": 3161
},
{
"epoch": 1.5630947966876776,
"grad_norm": 0.1132864349695712,
"learning_rate": 1.3403933820603806e-05,
"loss": 0.435,
"step": 3162
},
{
"epoch": 1.563589173155358,
"grad_norm": 0.11194618025536371,
"learning_rate": 1.3400272201033952e-05,
"loss": 0.4073,
"step": 3163
},
{
"epoch": 1.5640835496230379,
"grad_norm": 0.10771587675859809,
"learning_rate": 1.3396610065901219e-05,
"loss": 0.4208,
"step": 3164
},
{
"epoch": 1.5645779260907182,
"grad_norm": 0.10841855432597079,
"learning_rate": 1.3392947415760864e-05,
"loss": 0.4169,
"step": 3165
},
{
"epoch": 1.5650723025583981,
"grad_norm": 0.1129691329653808,
"learning_rate": 1.3389284251168237e-05,
"loss": 0.4054,
"step": 3166
},
{
"epoch": 1.5655666790260785,
"grad_norm": 0.11137234691601419,
"learning_rate": 1.3385620572678763e-05,
"loss": 0.4089,
"step": 3167
},
{
"epoch": 1.5660610554937584,
"grad_norm": 0.11439561204837732,
"learning_rate": 1.3381956380847942e-05,
"loss": 0.4124,
"step": 3168
},
{
"epoch": 1.5665554319614388,
"grad_norm": 0.11317558632682163,
"learning_rate": 1.3378291676231355e-05,
"loss": 0.4124,
"step": 3169
},
{
"epoch": 1.5670498084291187,
"grad_norm": 0.1102491587540443,
"learning_rate": 1.3374626459384655e-05,
"loss": 0.3935,
"step": 3170
},
{
"epoch": 1.567544184896799,
"grad_norm": 0.10990018904401277,
"learning_rate": 1.337096073086358e-05,
"loss": 0.3917,
"step": 3171
},
{
"epoch": 1.568038561364479,
"grad_norm": 0.11161798191301849,
"learning_rate": 1.3367294491223944e-05,
"loss": 0.3924,
"step": 3172
},
{
"epoch": 1.5685329378321593,
"grad_norm": 0.1083671923543046,
"learning_rate": 1.336362774102163e-05,
"loss": 0.4076,
"step": 3173
},
{
"epoch": 1.5690273142998392,
"grad_norm": 0.11301735563066916,
"learning_rate": 1.3359960480812614e-05,
"loss": 0.4099,
"step": 3174
},
{
"epoch": 1.5695216907675196,
"grad_norm": 0.11447057687055799,
"learning_rate": 1.3356292711152938e-05,
"loss": 0.4159,
"step": 3175
},
{
"epoch": 1.5700160672351995,
"grad_norm": 0.13325043738629413,
"learning_rate": 1.3352624432598717e-05,
"loss": 0.3772,
"step": 3176
},
{
"epoch": 1.5705104437028798,
"grad_norm": 0.12802538289751608,
"learning_rate": 1.3348955645706162e-05,
"loss": 0.4188,
"step": 3177
},
{
"epoch": 1.5710048201705598,
"grad_norm": 0.11625085927557574,
"learning_rate": 1.3345286351031544e-05,
"loss": 0.3943,
"step": 3178
},
{
"epoch": 1.5714991966382401,
"grad_norm": 0.10782210467291145,
"learning_rate": 1.3341616549131209e-05,
"loss": 0.4184,
"step": 3179
},
{
"epoch": 1.57199357310592,
"grad_norm": 0.1375396761734693,
"learning_rate": 1.3337946240561595e-05,
"loss": 0.393,
"step": 3180
},
{
"epoch": 1.5724879495736004,
"grad_norm": 0.10856862808886847,
"learning_rate": 1.3334275425879208e-05,
"loss": 0.4497,
"step": 3181
},
{
"epoch": 1.5729823260412803,
"grad_norm": 0.12458287768256038,
"learning_rate": 1.3330604105640633e-05,
"loss": 0.4303,
"step": 3182
},
{
"epoch": 1.5734767025089607,
"grad_norm": 0.11262695497911701,
"learning_rate": 1.3326932280402524e-05,
"loss": 0.4309,
"step": 3183
},
{
"epoch": 1.5739710789766406,
"grad_norm": 0.10919377565998686,
"learning_rate": 1.3323259950721626e-05,
"loss": 0.4025,
"step": 3184
},
{
"epoch": 1.574465455444321,
"grad_norm": 0.10979295533398342,
"learning_rate": 1.3319587117154746e-05,
"loss": 0.4128,
"step": 3185
},
{
"epoch": 1.5749598319120008,
"grad_norm": 0.11975350694855956,
"learning_rate": 1.3315913780258778e-05,
"loss": 0.3914,
"step": 3186
},
{
"epoch": 1.5754542083796812,
"grad_norm": 0.11035094154383818,
"learning_rate": 1.3312239940590683e-05,
"loss": 0.3828,
"step": 3187
},
{
"epoch": 1.5759485848473611,
"grad_norm": 0.11729761111006012,
"learning_rate": 1.3308565598707508e-05,
"loss": 0.384,
"step": 3188
},
{
"epoch": 1.5764429613150415,
"grad_norm": 0.1179022324271253,
"learning_rate": 1.3304890755166366e-05,
"loss": 0.4082,
"step": 3189
},
{
"epoch": 1.5769373377827214,
"grad_norm": 0.11125394080599833,
"learning_rate": 1.3301215410524462e-05,
"loss": 0.3893,
"step": 3190
},
{
"epoch": 1.5774317142504017,
"grad_norm": 0.10952964668798391,
"learning_rate": 1.3297539565339057e-05,
"loss": 0.396,
"step": 3191
},
{
"epoch": 1.5779260907180817,
"grad_norm": 0.10973654361451052,
"learning_rate": 1.3293863220167497e-05,
"loss": 0.4036,
"step": 3192
},
{
"epoch": 1.578420467185762,
"grad_norm": 0.111160679826495,
"learning_rate": 1.329018637556721e-05,
"loss": 0.3943,
"step": 3193
},
{
"epoch": 1.5789148436534421,
"grad_norm": 0.1135636353400979,
"learning_rate": 1.3286509032095691e-05,
"loss": 0.4052,
"step": 3194
},
{
"epoch": 1.5794092201211223,
"grad_norm": 0.11291422387439864,
"learning_rate": 1.3282831190310513e-05,
"loss": 0.3966,
"step": 3195
},
{
"epoch": 1.5799035965888024,
"grad_norm": 0.1129082518249767,
"learning_rate": 1.3279152850769323e-05,
"loss": 0.4213,
"step": 3196
},
{
"epoch": 1.5803979730564826,
"grad_norm": 0.1156456958603764,
"learning_rate": 1.3275474014029855e-05,
"loss": 0.4011,
"step": 3197
},
{
"epoch": 1.5808923495241627,
"grad_norm": 0.11104077670590694,
"learning_rate": 1.3271794680649897e-05,
"loss": 0.4064,
"step": 3198
},
{
"epoch": 1.5813867259918428,
"grad_norm": 0.1141695135847261,
"learning_rate": 1.326811485118733e-05,
"loss": 0.415,
"step": 3199
},
{
"epoch": 1.581881102459523,
"grad_norm": 0.11563995402333277,
"learning_rate": 1.3264434526200105e-05,
"loss": 0.4073,
"step": 3200
},
{
"epoch": 1.582375478927203,
"grad_norm": 0.11383239807113227,
"learning_rate": 1.3260753706246247e-05,
"loss": 0.4025,
"step": 3201
},
{
"epoch": 1.5828698553948832,
"grad_norm": 0.10898526758801608,
"learning_rate": 1.3257072391883856e-05,
"loss": 0.3942,
"step": 3202
},
{
"epoch": 1.5833642318625634,
"grad_norm": 0.1150094240743087,
"learning_rate": 1.3253390583671109e-05,
"loss": 0.434,
"step": 3203
},
{
"epoch": 1.5838586083302435,
"grad_norm": 0.11096057460106108,
"learning_rate": 1.3249708282166255e-05,
"loss": 0.4185,
"step": 3204
},
{
"epoch": 1.5843529847979236,
"grad_norm": 0.1296474336518383,
"learning_rate": 1.3246025487927617e-05,
"loss": 0.4298,
"step": 3205
},
{
"epoch": 1.5848473612656038,
"grad_norm": 0.11190048181176888,
"learning_rate": 1.3242342201513599e-05,
"loss": 0.3905,
"step": 3206
},
{
"epoch": 1.585341737733284,
"grad_norm": 0.11500564316729253,
"learning_rate": 1.3238658423482675e-05,
"loss": 0.3972,
"step": 3207
},
{
"epoch": 1.585836114200964,
"grad_norm": 0.104777364452907,
"learning_rate": 1.3234974154393395e-05,
"loss": 0.4353,
"step": 3208
},
{
"epoch": 1.5863304906686442,
"grad_norm": 0.11754266758109318,
"learning_rate": 1.3231289394804376e-05,
"loss": 0.4218,
"step": 3209
},
{
"epoch": 1.5868248671363243,
"grad_norm": 0.10660983219528616,
"learning_rate": 1.3227604145274327e-05,
"loss": 0.4169,
"step": 3210
},
{
"epoch": 1.5873192436040044,
"grad_norm": 0.12640004061327462,
"learning_rate": 1.3223918406362011e-05,
"loss": 0.3953,
"step": 3211
},
{
"epoch": 1.5878136200716846,
"grad_norm": 0.1052230519391203,
"learning_rate": 1.3220232178626277e-05,
"loss": 0.4024,
"step": 3212
},
{
"epoch": 1.5883079965393647,
"grad_norm": 0.10462718910015606,
"learning_rate": 1.3216545462626051e-05,
"loss": 0.4109,
"step": 3213
},
{
"epoch": 1.5888023730070449,
"grad_norm": 0.12450639566978461,
"learning_rate": 1.321285825892032e-05,
"loss": 0.4071,
"step": 3214
},
{
"epoch": 1.589296749474725,
"grad_norm": 0.11505163473889396,
"learning_rate": 1.3209170568068157e-05,
"loss": 0.4049,
"step": 3215
},
{
"epoch": 1.5897911259424051,
"grad_norm": 0.1074843439053129,
"learning_rate": 1.3205482390628703e-05,
"loss": 0.3917,
"step": 3216
},
{
"epoch": 1.5902855024100853,
"grad_norm": 0.11084655931229465,
"learning_rate": 1.3201793727161174e-05,
"loss": 0.4126,
"step": 3217
},
{
"epoch": 1.5907798788777654,
"grad_norm": 0.11028788099893284,
"learning_rate": 1.319810457822486e-05,
"loss": 0.4258,
"step": 3218
},
{
"epoch": 1.5912742553454455,
"grad_norm": 0.11898714714945254,
"learning_rate": 1.3194414944379125e-05,
"loss": 0.4045,
"step": 3219
},
{
"epoch": 1.5917686318131257,
"grad_norm": 0.10366792381457622,
"learning_rate": 1.3190724826183407e-05,
"loss": 0.4133,
"step": 3220
},
{
"epoch": 1.5922630082808058,
"grad_norm": 0.11333395314787735,
"learning_rate": 1.3187034224197214e-05,
"loss": 0.4334,
"step": 3221
},
{
"epoch": 1.592757384748486,
"grad_norm": 0.11403733784458664,
"learning_rate": 1.3183343138980132e-05,
"loss": 0.3877,
"step": 3222
},
{
"epoch": 1.593251761216166,
"grad_norm": 0.10591126469366587,
"learning_rate": 1.3179651571091818e-05,
"loss": 0.3986,
"step": 3223
},
{
"epoch": 1.5937461376838462,
"grad_norm": 0.11071842751892452,
"learning_rate": 1.3175959521092003e-05,
"loss": 0.4079,
"step": 3224
},
{
"epoch": 1.5942405141515263,
"grad_norm": 0.10715984222109694,
"learning_rate": 1.3172266989540485e-05,
"loss": 0.4506,
"step": 3225
},
{
"epoch": 1.5947348906192065,
"grad_norm": 0.11591684544561284,
"learning_rate": 1.3168573976997148e-05,
"loss": 0.4044,
"step": 3226
},
{
"epoch": 1.5952292670868866,
"grad_norm": 0.10682584280047143,
"learning_rate": 1.3164880484021938e-05,
"loss": 0.3761,
"step": 3227
},
{
"epoch": 1.5957236435545668,
"grad_norm": 0.10913606189671966,
"learning_rate": 1.3161186511174875e-05,
"loss": 0.4452,
"step": 3228
},
{
"epoch": 1.5962180200222469,
"grad_norm": 0.11217785757742775,
"learning_rate": 1.3157492059016055e-05,
"loss": 0.4433,
"step": 3229
},
{
"epoch": 1.5967123964899272,
"grad_norm": 0.10957691569602693,
"learning_rate": 1.315379712810565e-05,
"loss": 0.3886,
"step": 3230
},
{
"epoch": 1.5972067729576072,
"grad_norm": 0.11052520284688971,
"learning_rate": 1.3150101719003896e-05,
"loss": 0.3979,
"step": 3231
},
{
"epoch": 1.5977011494252875,
"grad_norm": 0.10874222852003074,
"learning_rate": 1.3146405832271105e-05,
"loss": 0.3975,
"step": 3232
},
{
"epoch": 1.5981955258929674,
"grad_norm": 0.10865590804459598,
"learning_rate": 1.3142709468467665e-05,
"loss": 0.4138,
"step": 3233
},
{
"epoch": 1.5986899023606478,
"grad_norm": 0.11785647414878586,
"learning_rate": 1.3139012628154033e-05,
"loss": 0.4088,
"step": 3234
},
{
"epoch": 1.5991842788283277,
"grad_norm": 0.10863946589021672,
"learning_rate": 1.3135315311890737e-05,
"loss": 0.4001,
"step": 3235
},
{
"epoch": 1.599678655296008,
"grad_norm": 0.10852494662478582,
"learning_rate": 1.313161752023838e-05,
"loss": 0.4177,
"step": 3236
},
{
"epoch": 1.600173031763688,
"grad_norm": 0.1539889704188668,
"learning_rate": 1.3127919253757637e-05,
"loss": 0.4253,
"step": 3237
},
{
"epoch": 1.6006674082313683,
"grad_norm": 0.11101014571259225,
"learning_rate": 1.3124220513009252e-05,
"loss": 0.38,
"step": 3238
},
{
"epoch": 1.6011617846990482,
"grad_norm": 0.11772946526845883,
"learning_rate": 1.3120521298554043e-05,
"loss": 0.3962,
"step": 3239
},
{
"epoch": 1.6016561611667286,
"grad_norm": 0.10510225638872736,
"learning_rate": 1.3116821610952902e-05,
"loss": 0.3934,
"step": 3240
},
{
"epoch": 1.6021505376344085,
"grad_norm": 0.11079442082852567,
"learning_rate": 1.3113121450766783e-05,
"loss": 0.3968,
"step": 3241
},
{
"epoch": 1.6026449141020889,
"grad_norm": 0.10426793865703024,
"learning_rate": 1.3109420818556731e-05,
"loss": 0.4137,
"step": 3242
},
{
"epoch": 1.6031392905697688,
"grad_norm": 0.11688320180147274,
"learning_rate": 1.3105719714883845e-05,
"loss": 0.4215,
"step": 3243
},
{
"epoch": 1.6036336670374491,
"grad_norm": 0.10703593608936147,
"learning_rate": 1.3102018140309297e-05,
"loss": 0.446,
"step": 3244
},
{
"epoch": 1.604128043505129,
"grad_norm": 0.1145385520244045,
"learning_rate": 1.3098316095394341e-05,
"loss": 0.4076,
"step": 3245
},
{
"epoch": 1.6046224199728094,
"grad_norm": 0.1115806852022362,
"learning_rate": 1.3094613580700295e-05,
"loss": 0.4058,
"step": 3246
},
{
"epoch": 1.6051167964404893,
"grad_norm": 0.10719848713568235,
"learning_rate": 1.3090910596788541e-05,
"loss": 0.4034,
"step": 3247
},
{
"epoch": 1.6056111729081697,
"grad_norm": 0.10794540441743555,
"learning_rate": 1.308720714422055e-05,
"loss": 0.3798,
"step": 3248
},
{
"epoch": 1.6061055493758496,
"grad_norm": 0.11136487876312408,
"learning_rate": 1.3083503223557852e-05,
"loss": 0.4065,
"step": 3249
},
{
"epoch": 1.60659992584353,
"grad_norm": 0.11150771553174561,
"learning_rate": 1.307979883536205e-05,
"loss": 0.4194,
"step": 3250
},
{
"epoch": 1.6070943023112099,
"grad_norm": 0.10338240408029234,
"learning_rate": 1.3076093980194815e-05,
"loss": 0.4041,
"step": 3251
},
{
"epoch": 1.6075886787788902,
"grad_norm": 0.11026239108791308,
"learning_rate": 1.3072388658617896e-05,
"loss": 0.4011,
"step": 3252
},
{
"epoch": 1.6080830552465701,
"grad_norm": 0.10989992529831902,
"learning_rate": 1.3068682871193105e-05,
"loss": 0.4708,
"step": 3253
},
{
"epoch": 1.6085774317142505,
"grad_norm": 0.1629319414960619,
"learning_rate": 1.3064976618482332e-05,
"loss": 0.4387,
"step": 3254
},
{
"epoch": 1.6090718081819304,
"grad_norm": 0.10660954535557798,
"learning_rate": 1.3061269901047528e-05,
"loss": 0.4019,
"step": 3255
},
{
"epoch": 1.6095661846496108,
"grad_norm": 0.10873586700405422,
"learning_rate": 1.3057562719450732e-05,
"loss": 0.4004,
"step": 3256
},
{
"epoch": 1.6100605611172907,
"grad_norm": 0.10921397034933823,
"learning_rate": 1.305385507425403e-05,
"loss": 0.3873,
"step": 3257
},
{
"epoch": 1.610554937584971,
"grad_norm": 0.11495427413762234,
"learning_rate": 1.3050146966019592e-05,
"loss": 0.4142,
"step": 3258
},
{
"epoch": 1.611049314052651,
"grad_norm": 0.1104562707458508,
"learning_rate": 1.3046438395309665e-05,
"loss": 0.4019,
"step": 3259
},
{
"epoch": 1.6115436905203313,
"grad_norm": 0.11978911332594575,
"learning_rate": 1.3042729362686546e-05,
"loss": 0.4048,
"step": 3260
},
{
"epoch": 1.6120380669880112,
"grad_norm": 0.12728824285901089,
"learning_rate": 1.3039019868712617e-05,
"loss": 0.4255,
"step": 3261
},
{
"epoch": 1.6125324434556916,
"grad_norm": 2.8765257559177653,
"learning_rate": 1.3035309913950332e-05,
"loss": 0.4565,
"step": 3262
},
{
"epoch": 1.6130268199233715,
"grad_norm": 0.11741756635923939,
"learning_rate": 1.30315994989622e-05,
"loss": 0.4229,
"step": 3263
},
{
"epoch": 1.6135211963910518,
"grad_norm": 0.11513522563601386,
"learning_rate": 1.3027888624310816e-05,
"loss": 0.4009,
"step": 3264
},
{
"epoch": 1.6140155728587318,
"grad_norm": 0.19496649652966375,
"learning_rate": 1.3024177290558835e-05,
"loss": 0.4136,
"step": 3265
},
{
"epoch": 1.6145099493264121,
"grad_norm": 0.6860725984989332,
"learning_rate": 1.3020465498268986e-05,
"loss": 0.4313,
"step": 3266
},
{
"epoch": 1.6150043257940923,
"grad_norm": 0.12465818464236052,
"learning_rate": 1.3016753248004064e-05,
"loss": 0.4027,
"step": 3267
},
{
"epoch": 1.6154987022617724,
"grad_norm": 0.126734703837297,
"learning_rate": 1.3013040540326935e-05,
"loss": 0.4427,
"step": 3268
},
{
"epoch": 1.6159930787294525,
"grad_norm": 0.12707263631906154,
"learning_rate": 1.3009327375800536e-05,
"loss": 0.4047,
"step": 3269
},
{
"epoch": 1.6164874551971327,
"grad_norm": 0.1442812167245888,
"learning_rate": 1.300561375498787e-05,
"loss": 0.4065,
"step": 3270
},
{
"epoch": 1.6169818316648128,
"grad_norm": 0.13076308589072083,
"learning_rate": 1.300189967845201e-05,
"loss": 0.402,
"step": 3271
},
{
"epoch": 1.617476208132493,
"grad_norm": 0.1396512358086959,
"learning_rate": 1.2998185146756108e-05,
"loss": 0.4402,
"step": 3272
},
{
"epoch": 1.617970584600173,
"grad_norm": 0.13576915652547894,
"learning_rate": 1.2994470160463367e-05,
"loss": 0.4104,
"step": 3273
},
{
"epoch": 1.6184649610678532,
"grad_norm": 0.12023409585388373,
"learning_rate": 1.2990754720137066e-05,
"loss": 0.4148,
"step": 3274
},
{
"epoch": 1.6189593375355333,
"grad_norm": 0.11440760786057261,
"learning_rate": 1.2987038826340563e-05,
"loss": 0.3936,
"step": 3275
},
{
"epoch": 1.6194537140032135,
"grad_norm": 0.26622416015986744,
"learning_rate": 1.2983322479637277e-05,
"loss": 0.4076,
"step": 3276
},
{
"epoch": 1.6199480904708936,
"grad_norm": 0.11660295870020032,
"learning_rate": 1.2979605680590686e-05,
"loss": 0.4027,
"step": 3277
},
{
"epoch": 1.6204424669385737,
"grad_norm": 0.1167173046701606,
"learning_rate": 1.2975888429764354e-05,
"loss": 0.4131,
"step": 3278
},
{
"epoch": 1.6209368434062539,
"grad_norm": 0.11701079771453772,
"learning_rate": 1.2972170727721904e-05,
"loss": 0.3906,
"step": 3279
},
{
"epoch": 1.621431219873934,
"grad_norm": 0.12096939859273206,
"learning_rate": 1.2968452575027024e-05,
"loss": 0.3823,
"step": 3280
},
{
"epoch": 1.6219255963416142,
"grad_norm": 0.10697452014182178,
"learning_rate": 1.2964733972243484e-05,
"loss": 0.3946,
"step": 3281
},
{
"epoch": 1.6224199728092943,
"grad_norm": 0.11855049236124549,
"learning_rate": 1.2961014919935106e-05,
"loss": 0.4222,
"step": 3282
},
{
"epoch": 1.6229143492769744,
"grad_norm": 0.11615707545147949,
"learning_rate": 1.2957295418665789e-05,
"loss": 0.5271,
"step": 3283
},
{
"epoch": 1.6234087257446546,
"grad_norm": 1.970051212699863,
"learning_rate": 1.2953575468999503e-05,
"loss": 0.4034,
"step": 3284
},
{
"epoch": 1.6239031022123347,
"grad_norm": 0.1295637213796161,
"learning_rate": 1.2949855071500277e-05,
"loss": 0.3884,
"step": 3285
},
{
"epoch": 1.6243974786800148,
"grad_norm": 0.1244551226035887,
"learning_rate": 1.2946134226732215e-05,
"loss": 0.4792,
"step": 3286
},
{
"epoch": 1.624891855147695,
"grad_norm": 0.1755893021146654,
"learning_rate": 1.2942412935259483e-05,
"loss": 0.3876,
"step": 3287
},
{
"epoch": 1.625386231615375,
"grad_norm": 0.14799908845267884,
"learning_rate": 1.293869119764632e-05,
"loss": 0.4297,
"step": 3288
},
{
"epoch": 1.6258806080830552,
"grad_norm": 0.14136291262973774,
"learning_rate": 1.2934969014457037e-05,
"loss": 0.4376,
"step": 3289
},
{
"epoch": 1.6263749845507354,
"grad_norm": 0.13940027923047482,
"learning_rate": 1.2931246386255996e-05,
"loss": 0.3945,
"step": 3290
},
{
"epoch": 1.6268693610184155,
"grad_norm": 0.12182386384652155,
"learning_rate": 1.2927523313607639e-05,
"loss": 0.4446,
"step": 3291
},
{
"epoch": 1.6273637374860956,
"grad_norm": 0.11866425148678614,
"learning_rate": 1.2923799797076484e-05,
"loss": 0.4044,
"step": 3292
},
{
"epoch": 1.6278581139537758,
"grad_norm": 0.12427440004991812,
"learning_rate": 1.292007583722709e-05,
"loss": 0.4259,
"step": 3293
},
{
"epoch": 1.628352490421456,
"grad_norm": 0.12807243022974382,
"learning_rate": 1.2916351434624108e-05,
"loss": 0.4195,
"step": 3294
},
{
"epoch": 1.628846866889136,
"grad_norm": 0.12043702514832418,
"learning_rate": 1.2912626589832247e-05,
"loss": 0.4165,
"step": 3295
},
{
"epoch": 1.6293412433568162,
"grad_norm": 0.11303828215493211,
"learning_rate": 1.2908901303416274e-05,
"loss": 0.3961,
"step": 3296
},
{
"epoch": 1.6298356198244963,
"grad_norm": 0.12837202176617993,
"learning_rate": 1.2905175575941045e-05,
"loss": 0.3977,
"step": 3297
},
{
"epoch": 1.6303299962921765,
"grad_norm": 0.11922179699571635,
"learning_rate": 1.290144940797146e-05,
"loss": 0.4202,
"step": 3298
},
{
"epoch": 1.6308243727598566,
"grad_norm": 0.11456297057773199,
"learning_rate": 1.28977228000725e-05,
"loss": 0.3993,
"step": 3299
},
{
"epoch": 1.6313187492275367,
"grad_norm": 0.1140774084978768,
"learning_rate": 1.2893995752809206e-05,
"loss": 0.4205,
"step": 3300
},
{
"epoch": 1.6318131256952169,
"grad_norm": 0.12291188486105467,
"learning_rate": 1.2890268266746689e-05,
"loss": 0.414,
"step": 3301
},
{
"epoch": 1.632307502162897,
"grad_norm": 0.11570213643371471,
"learning_rate": 1.2886540342450124e-05,
"loss": 0.4171,
"step": 3302
},
{
"epoch": 1.6328018786305771,
"grad_norm": 0.11443262118652517,
"learning_rate": 1.2882811980484755e-05,
"loss": 0.3909,
"step": 3303
},
{
"epoch": 1.6332962550982573,
"grad_norm": 0.11756091179358134,
"learning_rate": 1.287908318141589e-05,
"loss": 0.4326,
"step": 3304
},
{
"epoch": 1.6337906315659376,
"grad_norm": 0.11719127501107372,
"learning_rate": 1.287535394580891e-05,
"loss": 0.4056,
"step": 3305
},
{
"epoch": 1.6342850080336175,
"grad_norm": 0.11076845555293188,
"learning_rate": 1.2871624274229249e-05,
"loss": 0.4266,
"step": 3306
},
{
"epoch": 1.634779384501298,
"grad_norm": 0.120573403846557,
"learning_rate": 1.2867894167242416e-05,
"loss": 0.4044,
"step": 3307
},
{
"epoch": 1.6352737609689778,
"grad_norm": 0.10969231402149232,
"learning_rate": 1.286416362541399e-05,
"loss": 0.4116,
"step": 3308
},
{
"epoch": 1.6357681374366582,
"grad_norm": 0.11139764886465053,
"learning_rate": 1.2860432649309607e-05,
"loss": 0.4135,
"step": 3309
},
{
"epoch": 1.636262513904338,
"grad_norm": 0.1240837160481312,
"learning_rate": 1.2856701239494969e-05,
"loss": 0.4274,
"step": 3310
},
{
"epoch": 1.6367568903720184,
"grad_norm": 0.11809666419413029,
"learning_rate": 1.2852969396535852e-05,
"loss": 0.4139,
"step": 3311
},
{
"epoch": 1.6372512668396983,
"grad_norm": 0.10898390831292792,
"learning_rate": 1.2849237120998094e-05,
"loss": 0.3952,
"step": 3312
},
{
"epoch": 1.6377456433073787,
"grad_norm": 0.11620462839723944,
"learning_rate": 1.2845504413447597e-05,
"loss": 0.4135,
"step": 3313
},
{
"epoch": 1.6382400197750586,
"grad_norm": 0.11146953984534139,
"learning_rate": 1.2841771274450325e-05,
"loss": 0.4358,
"step": 3314
},
{
"epoch": 1.638734396242739,
"grad_norm": 0.15563468039707976,
"learning_rate": 1.2838037704572315e-05,
"loss": 0.4007,
"step": 3315
},
{
"epoch": 1.639228772710419,
"grad_norm": 0.11567075630766953,
"learning_rate": 1.2834303704379665e-05,
"loss": 0.4178,
"step": 3316
},
{
"epoch": 1.6397231491780992,
"grad_norm": 0.1075134335913693,
"learning_rate": 1.283056927443854e-05,
"loss": 0.4148,
"step": 3317
},
{
"epoch": 1.6402175256457792,
"grad_norm": 0.11420572602225594,
"learning_rate": 1.2826834415315165e-05,
"loss": 0.3849,
"step": 3318
},
{
"epoch": 1.6407119021134595,
"grad_norm": 0.1145029944408878,
"learning_rate": 1.282309912757584e-05,
"loss": 0.4021,
"step": 3319
},
{
"epoch": 1.6412062785811394,
"grad_norm": 0.10943060793008466,
"learning_rate": 1.2819363411786922e-05,
"loss": 0.4093,
"step": 3320
},
{
"epoch": 1.6417006550488198,
"grad_norm": 0.12167259546511859,
"learning_rate": 1.2815627268514837e-05,
"loss": 0.424,
"step": 3321
},
{
"epoch": 1.6421950315164997,
"grad_norm": 0.11021425937544649,
"learning_rate": 1.2811890698326069e-05,
"loss": 0.4056,
"step": 3322
},
{
"epoch": 1.64268940798418,
"grad_norm": 0.1109924149321394,
"learning_rate": 1.2808153701787172e-05,
"loss": 0.4307,
"step": 3323
},
{
"epoch": 1.64318378445186,
"grad_norm": 0.12566849858841567,
"learning_rate": 1.2804416279464771e-05,
"loss": 0.4186,
"step": 3324
},
{
"epoch": 1.6436781609195403,
"grad_norm": 0.11583420883386869,
"learning_rate": 1.2800678431925546e-05,
"loss": 0.3821,
"step": 3325
},
{
"epoch": 1.6441725373872202,
"grad_norm": 0.11724402886298166,
"learning_rate": 1.279694015973624e-05,
"loss": 0.3983,
"step": 3326
},
{
"epoch": 1.6446669138549006,
"grad_norm": 0.11804867813153595,
"learning_rate": 1.2793201463463671e-05,
"loss": 0.4077,
"step": 3327
},
{
"epoch": 1.6451612903225805,
"grad_norm": 0.11188387913579331,
"learning_rate": 1.2789462343674712e-05,
"loss": 0.4013,
"step": 3328
},
{
"epoch": 1.6456556667902609,
"grad_norm": 0.1092581277086804,
"learning_rate": 1.2785722800936302e-05,
"loss": 0.4106,
"step": 3329
},
{
"epoch": 1.6461500432579408,
"grad_norm": 0.11644147651779595,
"learning_rate": 1.2781982835815449e-05,
"loss": 0.4268,
"step": 3330
},
{
"epoch": 1.6466444197256211,
"grad_norm": 0.11540793402067216,
"learning_rate": 1.2778242448879219e-05,
"loss": 0.4103,
"step": 3331
},
{
"epoch": 1.647138796193301,
"grad_norm": 0.11270262606581795,
"learning_rate": 1.2774501640694746e-05,
"loss": 0.4181,
"step": 3332
},
{
"epoch": 1.6476331726609814,
"grad_norm": 0.10886237192440587,
"learning_rate": 1.2770760411829223e-05,
"loss": 0.389,
"step": 3333
},
{
"epoch": 1.6481275491286613,
"grad_norm": 0.10705872925167224,
"learning_rate": 1.2767018762849915e-05,
"loss": 0.4106,
"step": 3334
},
{
"epoch": 1.6486219255963417,
"grad_norm": 0.10716219425611613,
"learning_rate": 1.2763276694324143e-05,
"loss": 0.4203,
"step": 3335
},
{
"epoch": 1.6491163020640216,
"grad_norm": 0.11517587117634273,
"learning_rate": 1.2759534206819293e-05,
"loss": 0.46,
"step": 3336
},
{
"epoch": 1.649610678531702,
"grad_norm": 0.1916503335240056,
"learning_rate": 1.2755791300902816e-05,
"loss": 0.4221,
"step": 3337
},
{
"epoch": 1.6501050549993819,
"grad_norm": 0.1142912477168473,
"learning_rate": 1.2752047977142232e-05,
"loss": 0.4177,
"step": 3338
},
{
"epoch": 1.6505994314670622,
"grad_norm": 0.13021647284472113,
"learning_rate": 1.2748304236105114e-05,
"loss": 0.4537,
"step": 3339
},
{
"epoch": 1.6510938079347421,
"grad_norm": 0.11796207083977855,
"learning_rate": 1.27445600783591e-05,
"loss": 0.3854,
"step": 3340
},
{
"epoch": 1.6515881844024225,
"grad_norm": 0.10584066952783797,
"learning_rate": 1.2740815504471904e-05,
"loss": 0.3846,
"step": 3341
},
{
"epoch": 1.6520825608701026,
"grad_norm": 0.1111077208344759,
"learning_rate": 1.2737070515011284e-05,
"loss": 0.4408,
"step": 3342
},
{
"epoch": 1.6525769373377828,
"grad_norm": 0.11596313389085344,
"learning_rate": 1.2733325110545071e-05,
"loss": 0.4231,
"step": 3343
},
{
"epoch": 1.653071313805463,
"grad_norm": 0.11270511841081961,
"learning_rate": 1.2729579291641164e-05,
"loss": 0.4159,
"step": 3344
},
{
"epoch": 1.653565690273143,
"grad_norm": 0.11167785335353943,
"learning_rate": 1.2725833058867514e-05,
"loss": 0.4103,
"step": 3345
},
{
"epoch": 1.6540600667408232,
"grad_norm": 0.11428817524078669,
"learning_rate": 1.2722086412792143e-05,
"loss": 0.4272,
"step": 3346
},
{
"epoch": 1.6545544432085033,
"grad_norm": 0.11215450479010218,
"learning_rate": 1.271833935398313e-05,
"loss": 0.4113,
"step": 3347
},
{
"epoch": 1.6550488196761834,
"grad_norm": 0.11222842256895355,
"learning_rate": 1.2714591883008622e-05,
"loss": 0.3867,
"step": 3348
},
{
"epoch": 1.6555431961438636,
"grad_norm": 0.10924203336683158,
"learning_rate": 1.2710844000436822e-05,
"loss": 0.3895,
"step": 3349
},
{
"epoch": 1.6560375726115437,
"grad_norm": 0.11006562012752505,
"learning_rate": 1.2707095706836001e-05,
"loss": 0.3955,
"step": 3350
},
{
"epoch": 1.6565319490792239,
"grad_norm": 0.11981833834262516,
"learning_rate": 1.2703347002774491e-05,
"loss": 0.4526,
"step": 3351
},
{
"epoch": 1.657026325546904,
"grad_norm": 0.3869361783869001,
"learning_rate": 1.2699597888820682e-05,
"loss": 0.389,
"step": 3352
},
{
"epoch": 1.6575207020145841,
"grad_norm": 0.11781914420562228,
"learning_rate": 1.2695848365543032e-05,
"loss": 0.4236,
"step": 3353
},
{
"epoch": 1.6580150784822643,
"grad_norm": 0.11750525401227778,
"learning_rate": 1.2692098433510064e-05,
"loss": 0.4277,
"step": 3354
},
{
"epoch": 1.6585094549499444,
"grad_norm": 0.1114446854041541,
"learning_rate": 1.268834809329035e-05,
"loss": 0.4463,
"step": 3355
},
{
"epoch": 1.6590038314176245,
"grad_norm": 0.11465194577015571,
"learning_rate": 1.2684597345452532e-05,
"loss": 0.3952,
"step": 3356
},
{
"epoch": 1.6594982078853047,
"grad_norm": 0.21330740442332866,
"learning_rate": 1.2680846190565315e-05,
"loss": 0.4101,
"step": 3357
},
{
"epoch": 1.6599925843529848,
"grad_norm": 0.1118836109870138,
"learning_rate": 1.267709462919747e-05,
"loss": 0.4297,
"step": 3358
},
{
"epoch": 1.660486960820665,
"grad_norm": 0.17677901549269007,
"learning_rate": 1.2673342661917811e-05,
"loss": 0.4125,
"step": 3359
},
{
"epoch": 1.660981337288345,
"grad_norm": 0.11155113687650849,
"learning_rate": 1.2669590289295239e-05,
"loss": 0.3996,
"step": 3360
},
{
"epoch": 1.6614757137560252,
"grad_norm": 0.11375504716840325,
"learning_rate": 1.26658375118987e-05,
"loss": 0.3993,
"step": 3361
},
{
"epoch": 1.6619700902237053,
"grad_norm": 0.11685234969853397,
"learning_rate": 1.26620843302972e-05,
"loss": 0.4227,
"step": 3362
},
{
"epoch": 1.6624644666913855,
"grad_norm": 0.11431538797034678,
"learning_rate": 1.2658330745059815e-05,
"loss": 0.4046,
"step": 3363
},
{
"epoch": 1.6629588431590656,
"grad_norm": 0.11488517013795163,
"learning_rate": 1.2654576756755681e-05,
"loss": 0.4165,
"step": 3364
},
{
"epoch": 1.6634532196267457,
"grad_norm": 0.10567227268043877,
"learning_rate": 1.2650822365953988e-05,
"loss": 0.4014,
"step": 3365
},
{
"epoch": 1.6639475960944259,
"grad_norm": 0.12058569875310542,
"learning_rate": 1.2647067573223995e-05,
"loss": 0.4268,
"step": 3366
},
{
"epoch": 1.664441972562106,
"grad_norm": 0.11976106535272041,
"learning_rate": 1.2643312379135018e-05,
"loss": 0.4006,
"step": 3367
},
{
"epoch": 1.6649363490297862,
"grad_norm": 0.11914794334763465,
"learning_rate": 1.2639556784256435e-05,
"loss": 0.4211,
"step": 3368
},
{
"epoch": 1.6654307254974663,
"grad_norm": 0.11688935446454532,
"learning_rate": 1.2635800789157683e-05,
"loss": 0.3851,
"step": 3369
},
{
"epoch": 1.6659251019651464,
"grad_norm": 0.11208297947247467,
"learning_rate": 1.2632044394408265e-05,
"loss": 0.3996,
"step": 3370
},
{
"epoch": 1.6664194784328266,
"grad_norm": 0.10634478512255993,
"learning_rate": 1.2628287600577734e-05,
"loss": 0.3834,
"step": 3371
},
{
"epoch": 1.6669138549005067,
"grad_norm": 0.10824754766311175,
"learning_rate": 1.2624530408235716e-05,
"loss": 0.396,
"step": 3372
},
{
"epoch": 1.6674082313681868,
"grad_norm": 0.10978548689909556,
"learning_rate": 1.2620772817951883e-05,
"loss": 0.4311,
"step": 3373
},
{
"epoch": 1.667902607835867,
"grad_norm": 0.11860369865212368,
"learning_rate": 1.2617014830295991e-05,
"loss": 0.4072,
"step": 3374
},
{
"epoch": 1.668396984303547,
"grad_norm": 0.10867216990090212,
"learning_rate": 1.2613256445837823e-05,
"loss": 0.4051,
"step": 3375
},
{
"epoch": 1.6688913607712272,
"grad_norm": 0.11349498734523708,
"learning_rate": 1.2609497665147254e-05,
"loss": 0.3966,
"step": 3376
},
{
"epoch": 1.6693857372389074,
"grad_norm": 0.1124203131625371,
"learning_rate": 1.2605738488794204e-05,
"loss": 0.4169,
"step": 3377
},
{
"epoch": 1.6698801137065877,
"grad_norm": 0.10986894025475842,
"learning_rate": 1.2601978917348646e-05,
"loss": 0.391,
"step": 3378
},
{
"epoch": 1.6703744901742676,
"grad_norm": 0.11037338140707197,
"learning_rate": 1.259821895138063e-05,
"loss": 0.394,
"step": 3379
},
{
"epoch": 1.670868866641948,
"grad_norm": 0.11130264742248358,
"learning_rate": 1.259445859146025e-05,
"loss": 0.4179,
"step": 3380
},
{
"epoch": 1.671363243109628,
"grad_norm": 0.10948805632803947,
"learning_rate": 1.2590697838157673e-05,
"loss": 0.4469,
"step": 3381
},
{
"epoch": 1.6718576195773083,
"grad_norm": 0.11254383642175624,
"learning_rate": 1.2586936692043118e-05,
"loss": 0.4186,
"step": 3382
},
{
"epoch": 1.6723519960449882,
"grad_norm": 0.11046250711497597,
"learning_rate": 1.2583175153686859e-05,
"loss": 0.4166,
"step": 3383
},
{
"epoch": 1.6728463725126685,
"grad_norm": 0.10973751447391326,
"learning_rate": 1.2579413223659245e-05,
"loss": 0.4221,
"step": 3384
},
{
"epoch": 1.6733407489803485,
"grad_norm": 0.11051193610513957,
"learning_rate": 1.257565090253067e-05,
"loss": 0.4006,
"step": 3385
},
{
"epoch": 1.6738351254480288,
"grad_norm": 0.10689593562268299,
"learning_rate": 1.2571888190871588e-05,
"loss": 0.4133,
"step": 3386
},
{
"epoch": 1.6743295019157087,
"grad_norm": 0.11077476978666194,
"learning_rate": 1.2568125089252525e-05,
"loss": 0.4119,
"step": 3387
},
{
"epoch": 1.674823878383389,
"grad_norm": 0.11065661174732476,
"learning_rate": 1.2564361598244052e-05,
"loss": 0.3982,
"step": 3388
},
{
"epoch": 1.675318254851069,
"grad_norm": 0.11230683383398314,
"learning_rate": 1.2560597718416805e-05,
"loss": 0.4223,
"step": 3389
},
{
"epoch": 1.6758126313187494,
"grad_norm": 0.10656935430940703,
"learning_rate": 1.2556833450341484e-05,
"loss": 0.3932,
"step": 3390
},
{
"epoch": 1.6763070077864293,
"grad_norm": 0.10405494692127044,
"learning_rate": 1.2553068794588834e-05,
"loss": 0.3842,
"step": 3391
},
{
"epoch": 1.6768013842541096,
"grad_norm": 0.10894668683852038,
"learning_rate": 1.2549303751729669e-05,
"loss": 0.404,
"step": 3392
},
{
"epoch": 1.6772957607217895,
"grad_norm": 0.1075231633503141,
"learning_rate": 1.2545538322334867e-05,
"loss": 0.4115,
"step": 3393
},
{
"epoch": 1.67779013718947,
"grad_norm": 0.10401067004152814,
"learning_rate": 1.2541772506975349e-05,
"loss": 0.3898,
"step": 3394
},
{
"epoch": 1.6782845136571498,
"grad_norm": 0.11117422173647686,
"learning_rate": 1.2538006306222108e-05,
"loss": 0.4229,
"step": 3395
},
{
"epoch": 1.6787788901248302,
"grad_norm": 0.11273469178479716,
"learning_rate": 1.2534239720646188e-05,
"loss": 0.4203,
"step": 3396
},
{
"epoch": 1.67927326659251,
"grad_norm": 0.10596484896548869,
"learning_rate": 1.2530472750818696e-05,
"loss": 0.431,
"step": 3397
},
{
"epoch": 1.6797676430601904,
"grad_norm": 0.12021309613897632,
"learning_rate": 1.2526705397310794e-05,
"loss": 0.4064,
"step": 3398
},
{
"epoch": 1.6802620195278704,
"grad_norm": 0.10940185385452944,
"learning_rate": 1.2522937660693701e-05,
"loss": 0.4355,
"step": 3399
},
{
"epoch": 1.6807563959955507,
"grad_norm": 0.11490376189829365,
"learning_rate": 1.2519169541538701e-05,
"loss": 0.3746,
"step": 3400
},
{
"epoch": 1.6812507724632306,
"grad_norm": 0.10607807702990167,
"learning_rate": 1.2515401040417126e-05,
"loss": 0.404,
"step": 3401
},
{
"epoch": 1.681745148930911,
"grad_norm": 0.11389095039339461,
"learning_rate": 1.2511632157900375e-05,
"loss": 0.404,
"step": 3402
},
{
"epoch": 1.682239525398591,
"grad_norm": 0.11074979538184139,
"learning_rate": 1.2507862894559899e-05,
"loss": 0.3864,
"step": 3403
},
{
"epoch": 1.6827339018662713,
"grad_norm": 0.1069182015587074,
"learning_rate": 1.2504093250967211e-05,
"loss": 0.4541,
"step": 3404
},
{
"epoch": 1.6832282783339512,
"grad_norm": 0.11070696184265623,
"learning_rate": 1.2500323227693873e-05,
"loss": 0.3936,
"step": 3405
},
{
"epoch": 1.6837226548016315,
"grad_norm": 0.109973748379807,
"learning_rate": 1.2496552825311521e-05,
"loss": 0.4027,
"step": 3406
},
{
"epoch": 1.6842170312693114,
"grad_norm": 0.1191076841257813,
"learning_rate": 1.2492782044391835e-05,
"loss": 0.3982,
"step": 3407
},
{
"epoch": 1.6847114077369918,
"grad_norm": 0.1094067782560332,
"learning_rate": 1.2489010885506552e-05,
"loss": 0.4395,
"step": 3408
},
{
"epoch": 1.6852057842046717,
"grad_norm": 0.1177000908059843,
"learning_rate": 1.2485239349227471e-05,
"loss": 0.4098,
"step": 3409
},
{
"epoch": 1.685700160672352,
"grad_norm": 0.5266856363065892,
"learning_rate": 1.2481467436126455e-05,
"loss": 0.4216,
"step": 3410
},
{
"epoch": 1.686194537140032,
"grad_norm": 0.11280636099434309,
"learning_rate": 1.2477695146775406e-05,
"loss": 0.4096,
"step": 3411
},
{
"epoch": 1.6866889136077123,
"grad_norm": 0.1160163416904608,
"learning_rate": 1.2473922481746299e-05,
"loss": 0.3933,
"step": 3412
},
{
"epoch": 1.6871832900753923,
"grad_norm": 0.10707418467748957,
"learning_rate": 1.2470149441611161e-05,
"loss": 0.3873,
"step": 3413
},
{
"epoch": 1.6876776665430726,
"grad_norm": 0.11833000338773664,
"learning_rate": 1.2466376026942072e-05,
"loss": 0.4037,
"step": 3414
},
{
"epoch": 1.6881720430107527,
"grad_norm": 0.10772502512808785,
"learning_rate": 1.2462602238311177e-05,
"loss": 0.4129,
"step": 3415
},
{
"epoch": 1.6886664194784329,
"grad_norm": 0.10789229948281512,
"learning_rate": 1.245882807629067e-05,
"loss": 0.4068,
"step": 3416
},
{
"epoch": 1.689160795946113,
"grad_norm": 0.12158745095551446,
"learning_rate": 1.2455053541452806e-05,
"loss": 0.4253,
"step": 3417
},
{
"epoch": 1.6896551724137931,
"grad_norm": 0.14082526135454934,
"learning_rate": 1.2451278634369892e-05,
"loss": 0.4047,
"step": 3418
},
{
"epoch": 1.6901495488814733,
"grad_norm": 0.1085783270226596,
"learning_rate": 1.2447503355614296e-05,
"loss": 0.3879,
"step": 3419
},
{
"epoch": 1.6906439253491534,
"grad_norm": 0.12285244114929704,
"learning_rate": 1.2443727705758448e-05,
"loss": 0.4148,
"step": 3420
},
{
"epoch": 1.6911383018168336,
"grad_norm": 0.1221115815172085,
"learning_rate": 1.2439951685374816e-05,
"loss": 0.3889,
"step": 3421
},
{
"epoch": 1.6916326782845137,
"grad_norm": 0.11181159814562897,
"learning_rate": 1.2436175295035939e-05,
"loss": 0.3948,
"step": 3422
},
{
"epoch": 1.6921270547521938,
"grad_norm": 0.11995210717817188,
"learning_rate": 1.2432398535314412e-05,
"loss": 0.3897,
"step": 3423
},
{
"epoch": 1.692621431219874,
"grad_norm": 0.1294386758059145,
"learning_rate": 1.242862140678288e-05,
"loss": 0.4007,
"step": 3424
},
{
"epoch": 1.693115807687554,
"grad_norm": 0.1071188702149494,
"learning_rate": 1.2424843910014044e-05,
"loss": 0.4087,
"step": 3425
},
{
"epoch": 1.6936101841552342,
"grad_norm": 0.11588642423853428,
"learning_rate": 1.2421066045580665e-05,
"loss": 0.4032,
"step": 3426
},
{
"epoch": 1.6941045606229144,
"grad_norm": 0.11719343788762215,
"learning_rate": 1.2417287814055561e-05,
"loss": 0.3881,
"step": 3427
},
{
"epoch": 1.6945989370905945,
"grad_norm": 0.10579822076497201,
"learning_rate": 1.24135092160116e-05,
"loss": 0.4251,
"step": 3428
},
{
"epoch": 1.6950933135582746,
"grad_norm": 0.11283296078885288,
"learning_rate": 1.2409730252021709e-05,
"loss": 0.4089,
"step": 3429
},
{
"epoch": 1.6955876900259548,
"grad_norm": 0.11278316737380024,
"learning_rate": 1.2405950922658865e-05,
"loss": 0.4274,
"step": 3430
},
{
"epoch": 1.696082066493635,
"grad_norm": 0.11797534820545347,
"learning_rate": 1.2402171228496111e-05,
"loss": 0.3941,
"step": 3431
},
{
"epoch": 1.696576442961315,
"grad_norm": 0.10704007087884868,
"learning_rate": 1.2398391170106539e-05,
"loss": 0.4116,
"step": 3432
},
{
"epoch": 1.6970708194289952,
"grad_norm": 0.11123642257447018,
"learning_rate": 1.2394610748063292e-05,
"loss": 0.4285,
"step": 3433
},
{
"epoch": 1.6975651958966753,
"grad_norm": 0.11116792904061483,
"learning_rate": 1.2390829962939576e-05,
"loss": 0.4184,
"step": 3434
},
{
"epoch": 1.6980595723643555,
"grad_norm": 0.10845344470821675,
"learning_rate": 1.238704881530865e-05,
"loss": 0.4261,
"step": 3435
},
{
"epoch": 1.6985539488320356,
"grad_norm": 0.11608438829388514,
"learning_rate": 1.2383267305743825e-05,
"loss": 0.4026,
"step": 3436
},
{
"epoch": 1.6990483252997157,
"grad_norm": 0.10605339244166137,
"learning_rate": 1.2379485434818468e-05,
"loss": 0.3838,
"step": 3437
},
{
"epoch": 1.6995427017673959,
"grad_norm": 0.10555463469921417,
"learning_rate": 1.2375703203106e-05,
"loss": 0.4069,
"step": 3438
},
{
"epoch": 1.700037078235076,
"grad_norm": 0.10959659557581725,
"learning_rate": 1.2371920611179902e-05,
"loss": 0.419,
"step": 3439
},
{
"epoch": 1.7005314547027561,
"grad_norm": 0.10956201264762874,
"learning_rate": 1.2368137659613706e-05,
"loss": 0.3985,
"step": 3440
},
{
"epoch": 1.7010258311704363,
"grad_norm": 0.11610109272169974,
"learning_rate": 1.2364354348980993e-05,
"loss": 0.3965,
"step": 3441
},
{
"epoch": 1.7015202076381164,
"grad_norm": 0.10753958676541772,
"learning_rate": 1.2360570679855407e-05,
"loss": 0.395,
"step": 3442
},
{
"epoch": 1.7020145841057965,
"grad_norm": 0.15066420959663815,
"learning_rate": 1.2356786652810649e-05,
"loss": 0.3759,
"step": 3443
},
{
"epoch": 1.7025089605734767,
"grad_norm": 0.10731763574356844,
"learning_rate": 1.2353002268420454e-05,
"loss": 0.3873,
"step": 3444
},
{
"epoch": 1.7030033370411568,
"grad_norm": 0.12428554481011363,
"learning_rate": 1.2349217527258638e-05,
"loss": 0.3899,
"step": 3445
},
{
"epoch": 1.703497713508837,
"grad_norm": 0.11472738773397119,
"learning_rate": 1.2345432429899053e-05,
"loss": 0.4426,
"step": 3446
},
{
"epoch": 1.703992089976517,
"grad_norm": 0.11736503731080869,
"learning_rate": 1.2341646976915614e-05,
"loss": 0.3798,
"step": 3447
},
{
"epoch": 1.7044864664441972,
"grad_norm": 0.10451511562404776,
"learning_rate": 1.2337861168882284e-05,
"loss": 0.3898,
"step": 3448
},
{
"epoch": 1.7049808429118773,
"grad_norm": 0.10806479663444862,
"learning_rate": 1.2334075006373084e-05,
"loss": 0.4123,
"step": 3449
},
{
"epoch": 1.7054752193795575,
"grad_norm": 0.12149555298721217,
"learning_rate": 1.2330288489962083e-05,
"loss": 0.3981,
"step": 3450
},
{
"epoch": 1.7059695958472376,
"grad_norm": 0.11251227382441377,
"learning_rate": 1.2326501620223412e-05,
"loss": 0.4007,
"step": 3451
},
{
"epoch": 1.7064639723149178,
"grad_norm": 0.11000090792568955,
"learning_rate": 1.232271439773125e-05,
"loss": 0.3707,
"step": 3452
},
{
"epoch": 1.7069583487825981,
"grad_norm": 0.10328631812743343,
"learning_rate": 1.2318926823059834e-05,
"loss": 0.4041,
"step": 3453
},
{
"epoch": 1.707452725250278,
"grad_norm": 0.11402735834890845,
"learning_rate": 1.2315138896783445e-05,
"loss": 0.4066,
"step": 3454
},
{
"epoch": 1.7079471017179584,
"grad_norm": 0.10665182132283538,
"learning_rate": 1.2311350619476425e-05,
"loss": 0.4001,
"step": 3455
},
{
"epoch": 1.7084414781856383,
"grad_norm": 0.11128806363135277,
"learning_rate": 1.2307561991713175e-05,
"loss": 0.3866,
"step": 3456
},
{
"epoch": 1.7089358546533187,
"grad_norm": 0.10637613985231963,
"learning_rate": 1.2303773014068132e-05,
"loss": 0.4369,
"step": 3457
},
{
"epoch": 1.7094302311209986,
"grad_norm": 0.11504874132791847,
"learning_rate": 1.2299983687115804e-05,
"loss": 0.4103,
"step": 3458
},
{
"epoch": 1.709924607588679,
"grad_norm": 0.10586490626041599,
"learning_rate": 1.229619401143074e-05,
"loss": 0.3941,
"step": 3459
},
{
"epoch": 1.7104189840563588,
"grad_norm": 0.10932626979136946,
"learning_rate": 1.2292403987587544e-05,
"loss": 0.3907,
"step": 3460
},
{
"epoch": 1.7109133605240392,
"grad_norm": 0.10951444460834095,
"learning_rate": 1.2288613616160878e-05,
"loss": 0.406,
"step": 3461
},
{
"epoch": 1.711407736991719,
"grad_norm": 0.10976262763724895,
"learning_rate": 1.2284822897725453e-05,
"loss": 0.3854,
"step": 3462
},
{
"epoch": 1.7119021134593995,
"grad_norm": 0.11030721650117685,
"learning_rate": 1.228103183285603e-05,
"loss": 0.4236,
"step": 3463
},
{
"epoch": 1.7123964899270794,
"grad_norm": 0.2042077518050935,
"learning_rate": 1.227724042212743e-05,
"loss": 0.3917,
"step": 3464
},
{
"epoch": 1.7128908663947597,
"grad_norm": 0.18711710071052845,
"learning_rate": 1.2273448666114516e-05,
"loss": 0.4086,
"step": 3465
},
{
"epoch": 1.7133852428624397,
"grad_norm": 0.11075984428666354,
"learning_rate": 1.2269656565392216e-05,
"loss": 0.4122,
"step": 3466
},
{
"epoch": 1.71387961933012,
"grad_norm": 0.11513721152775458,
"learning_rate": 1.2265864120535498e-05,
"loss": 0.4148,
"step": 3467
},
{
"epoch": 1.7143739957978,
"grad_norm": 0.11557523980474378,
"learning_rate": 1.2262071332119387e-05,
"loss": 0.3982,
"step": 3468
},
{
"epoch": 1.7148683722654803,
"grad_norm": 0.10774051235954518,
"learning_rate": 1.2258278200718969e-05,
"loss": 0.3678,
"step": 3469
},
{
"epoch": 1.7153627487331602,
"grad_norm": 0.10372938467166717,
"learning_rate": 1.2254484726909366e-05,
"loss": 0.3866,
"step": 3470
},
{
"epoch": 1.7158571252008405,
"grad_norm": 0.1106754418288557,
"learning_rate": 1.2250690911265762e-05,
"loss": 0.4421,
"step": 3471
},
{
"epoch": 1.7163515016685205,
"grad_norm": 0.11285238509283253,
"learning_rate": 1.2246896754363391e-05,
"loss": 0.4037,
"step": 3472
},
{
"epoch": 1.7168458781362008,
"grad_norm": 0.11856122685504536,
"learning_rate": 1.2243102256777537e-05,
"loss": 0.4077,
"step": 3473
},
{
"epoch": 1.7173402546038807,
"grad_norm": 0.10780767675484224,
"learning_rate": 1.2239307419083534e-05,
"loss": 0.3984,
"step": 3474
},
{
"epoch": 1.717834631071561,
"grad_norm": 0.10593491941705009,
"learning_rate": 1.223551224185678e-05,
"loss": 0.3955,
"step": 3475
},
{
"epoch": 1.718329007539241,
"grad_norm": 0.11640748808113666,
"learning_rate": 1.2231716725672707e-05,
"loss": 0.4053,
"step": 3476
},
{
"epoch": 1.7188233840069214,
"grad_norm": 0.11471839061309415,
"learning_rate": 1.2227920871106806e-05,
"loss": 0.4118,
"step": 3477
},
{
"epoch": 1.7193177604746013,
"grad_norm": 0.10799126973932476,
"learning_rate": 1.2224124678734625e-05,
"loss": 0.389,
"step": 3478
},
{
"epoch": 1.7198121369422816,
"grad_norm": 0.12527473565355454,
"learning_rate": 1.2220328149131755e-05,
"loss": 0.3847,
"step": 3479
},
{
"epoch": 1.7203065134099615,
"grad_norm": 0.10948473501825992,
"learning_rate": 1.221653128287384e-05,
"loss": 0.3838,
"step": 3480
},
{
"epoch": 1.720800889877642,
"grad_norm": 0.11126322803097083,
"learning_rate": 1.221273408053658e-05,
"loss": 0.4022,
"step": 3481
},
{
"epoch": 1.7212952663453218,
"grad_norm": 0.11125823564618405,
"learning_rate": 1.2208936542695715e-05,
"loss": 0.3849,
"step": 3482
},
{
"epoch": 1.7217896428130022,
"grad_norm": 0.1163675919977474,
"learning_rate": 1.2205138669927049e-05,
"loss": 0.3823,
"step": 3483
},
{
"epoch": 1.722284019280682,
"grad_norm": 0.14296125753578082,
"learning_rate": 1.2201340462806428e-05,
"loss": 0.4185,
"step": 3484
},
{
"epoch": 1.7227783957483624,
"grad_norm": 0.10750765455181088,
"learning_rate": 1.2197541921909752e-05,
"loss": 0.4097,
"step": 3485
},
{
"epoch": 1.7232727722160424,
"grad_norm": 0.12784125198986715,
"learning_rate": 1.2193743047812971e-05,
"loss": 0.4524,
"step": 3486
},
{
"epoch": 1.7237671486837227,
"grad_norm": 0.11123408074112284,
"learning_rate": 1.2189943841092084e-05,
"loss": 0.4157,
"step": 3487
},
{
"epoch": 1.7242615251514026,
"grad_norm": 0.11609167265718978,
"learning_rate": 1.2186144302323146e-05,
"loss": 0.3818,
"step": 3488
},
{
"epoch": 1.724755901619083,
"grad_norm": 0.10794771755309408,
"learning_rate": 1.2182344432082256e-05,
"loss": 0.3866,
"step": 3489
},
{
"epoch": 1.7252502780867631,
"grad_norm": 0.10565090809243714,
"learning_rate": 1.2178544230945563e-05,
"loss": 0.3961,
"step": 3490
},
{
"epoch": 1.7257446545544433,
"grad_norm": 0.11608128145500353,
"learning_rate": 1.2174743699489272e-05,
"loss": 0.4585,
"step": 3491
},
{
"epoch": 1.7262390310221234,
"grad_norm": 1.2429838033145497,
"learning_rate": 1.2170942838289637e-05,
"loss": 0.4129,
"step": 3492
},
{
"epoch": 1.7267334074898035,
"grad_norm": 0.11411848176165446,
"learning_rate": 1.2167141647922952e-05,
"loss": 0.4281,
"step": 3493
},
{
"epoch": 1.7272277839574837,
"grad_norm": 0.11488896234472587,
"learning_rate": 1.2163340128965574e-05,
"loss": 0.3965,
"step": 3494
},
{
"epoch": 1.7277221604251638,
"grad_norm": 0.10824455898319875,
"learning_rate": 1.2159538281993906e-05,
"loss": 0.4184,
"step": 3495
},
{
"epoch": 1.728216536892844,
"grad_norm": 0.11787903247840871,
"learning_rate": 1.2155736107584395e-05,
"loss": 0.4027,
"step": 3496
},
{
"epoch": 1.728710913360524,
"grad_norm": 0.11209957312596631,
"learning_rate": 1.2151933606313544e-05,
"loss": 0.3915,
"step": 3497
},
{
"epoch": 1.7292052898282042,
"grad_norm": 0.11542904879199359,
"learning_rate": 1.2148130778757906e-05,
"loss": 0.4332,
"step": 3498
},
{
"epoch": 1.7296996662958843,
"grad_norm": 0.12047741272086702,
"learning_rate": 1.2144327625494077e-05,
"loss": 0.4174,
"step": 3499
},
{
"epoch": 1.7301940427635645,
"grad_norm": 0.12965547812605063,
"learning_rate": 1.2140524147098707e-05,
"loss": 0.4255,
"step": 3500
},
{
"epoch": 1.7306884192312446,
"grad_norm": 0.1272317656539165,
"learning_rate": 1.2136720344148494e-05,
"loss": 0.4078,
"step": 3501
},
{
"epoch": 1.7311827956989247,
"grad_norm": 0.11127096187312663,
"learning_rate": 1.2132916217220189e-05,
"loss": 0.3996,
"step": 3502
},
{
"epoch": 1.7316771721666049,
"grad_norm": 0.10658936018182433,
"learning_rate": 1.2129111766890588e-05,
"loss": 0.4249,
"step": 3503
},
{
"epoch": 1.732171548634285,
"grad_norm": 0.11320250955006601,
"learning_rate": 1.2125306993736535e-05,
"loss": 0.388,
"step": 3504
},
{
"epoch": 1.7326659251019652,
"grad_norm": 0.11548036278930818,
"learning_rate": 1.2121501898334926e-05,
"loss": 0.41,
"step": 3505
},
{
"epoch": 1.7331603015696453,
"grad_norm": 0.10559122483399942,
"learning_rate": 1.2117696481262706e-05,
"loss": 0.4139,
"step": 3506
},
{
"epoch": 1.7336546780373254,
"grad_norm": 0.11321499775445559,
"learning_rate": 1.2113890743096863e-05,
"loss": 0.4014,
"step": 3507
},
{
"epoch": 1.7341490545050056,
"grad_norm": 0.10665845082687256,
"learning_rate": 1.2110084684414445e-05,
"loss": 0.3992,
"step": 3508
},
{
"epoch": 1.7346434309726857,
"grad_norm": 0.10779514473396364,
"learning_rate": 1.2106278305792536e-05,
"loss": 0.4289,
"step": 3509
},
{
"epoch": 1.7351378074403658,
"grad_norm": 0.1075128787274869,
"learning_rate": 1.210247160780828e-05,
"loss": 0.4285,
"step": 3510
},
{
"epoch": 1.735632183908046,
"grad_norm": 0.11676776636654129,
"learning_rate": 1.209866459103886e-05,
"loss": 0.435,
"step": 3511
},
{
"epoch": 1.736126560375726,
"grad_norm": 0.11247277798212957,
"learning_rate": 1.209485725606151e-05,
"loss": 0.3905,
"step": 3512
},
{
"epoch": 1.7366209368434062,
"grad_norm": 0.10799886501068888,
"learning_rate": 1.2091049603453518e-05,
"loss": 0.4034,
"step": 3513
},
{
"epoch": 1.7371153133110864,
"grad_norm": 0.11107859046247831,
"learning_rate": 1.208724163379221e-05,
"loss": 0.419,
"step": 3514
},
{
"epoch": 1.7376096897787665,
"grad_norm": 0.11054550111809995,
"learning_rate": 1.2083433347654968e-05,
"loss": 0.3888,
"step": 3515
},
{
"epoch": 1.7381040662464466,
"grad_norm": 0.10910435999926762,
"learning_rate": 1.2079624745619223e-05,
"loss": 0.393,
"step": 3516
},
{
"epoch": 1.7385984427141268,
"grad_norm": 0.10556349361065569,
"learning_rate": 1.2075815828262443e-05,
"loss": 0.4105,
"step": 3517
},
{
"epoch": 1.739092819181807,
"grad_norm": 0.11207293909766901,
"learning_rate": 1.207200659616216e-05,
"loss": 0.3965,
"step": 3518
},
{
"epoch": 1.739587195649487,
"grad_norm": 0.11072241133403085,
"learning_rate": 1.206819704989594e-05,
"loss": 0.4179,
"step": 3519
},
{
"epoch": 1.7400815721171672,
"grad_norm": 0.10948018582927471,
"learning_rate": 1.20643871900414e-05,
"loss": 0.3778,
"step": 3520
},
{
"epoch": 1.7405759485848473,
"grad_norm": 0.11451209845519929,
"learning_rate": 1.206057701717621e-05,
"loss": 0.3949,
"step": 3521
},
{
"epoch": 1.7410703250525275,
"grad_norm": 0.11090559458877967,
"learning_rate": 1.2056766531878083e-05,
"loss": 0.4302,
"step": 3522
},
{
"epoch": 1.7415647015202076,
"grad_norm": 0.11778776048050062,
"learning_rate": 1.2052955734724777e-05,
"loss": 0.395,
"step": 3523
},
{
"epoch": 1.7420590779878877,
"grad_norm": 0.11319135381768243,
"learning_rate": 1.2049144626294105e-05,
"loss": 0.4104,
"step": 3524
},
{
"epoch": 1.7425534544555679,
"grad_norm": 0.11672383032967702,
"learning_rate": 1.2045333207163923e-05,
"loss": 0.3844,
"step": 3525
},
{
"epoch": 1.743047830923248,
"grad_norm": 0.13363919108022282,
"learning_rate": 1.2041521477912124e-05,
"loss": 0.4258,
"step": 3526
},
{
"epoch": 1.7435422073909281,
"grad_norm": 0.10658161021316732,
"learning_rate": 1.2037709439116669e-05,
"loss": 0.3988,
"step": 3527
},
{
"epoch": 1.7440365838586085,
"grad_norm": 0.10842433164104287,
"learning_rate": 1.2033897091355548e-05,
"loss": 0.3702,
"step": 3528
},
{
"epoch": 1.7445309603262884,
"grad_norm": 0.10848158581972528,
"learning_rate": 1.2030084435206809e-05,
"loss": 0.3988,
"step": 3529
},
{
"epoch": 1.7450253367939688,
"grad_norm": 0.11342904351403452,
"learning_rate": 1.2026271471248536e-05,
"loss": 0.4146,
"step": 3530
},
{
"epoch": 1.7455197132616487,
"grad_norm": 0.10646881741226623,
"learning_rate": 1.2022458200058873e-05,
"loss": 0.4054,
"step": 3531
},
{
"epoch": 1.746014089729329,
"grad_norm": 0.11604227157333238,
"learning_rate": 1.2018644622215998e-05,
"loss": 0.3858,
"step": 3532
},
{
"epoch": 1.746508466197009,
"grad_norm": 0.10547604226700774,
"learning_rate": 1.2014830738298145e-05,
"loss": 0.4287,
"step": 3533
},
{
"epoch": 1.7470028426646893,
"grad_norm": 0.11182639300930158,
"learning_rate": 1.2011016548883585e-05,
"loss": 0.3961,
"step": 3534
},
{
"epoch": 1.7474972191323692,
"grad_norm": 0.11227452680563174,
"learning_rate": 1.2007202054550646e-05,
"loss": 0.4528,
"step": 3535
},
{
"epoch": 1.7479915956000496,
"grad_norm": 0.11356838988184775,
"learning_rate": 1.2003387255877695e-05,
"loss": 0.4057,
"step": 3536
},
{
"epoch": 1.7484859720677295,
"grad_norm": 0.10926473678518504,
"learning_rate": 1.1999572153443142e-05,
"loss": 0.4026,
"step": 3537
},
{
"epoch": 1.7489803485354098,
"grad_norm": 0.1068174397746832,
"learning_rate": 1.199575674782546e-05,
"loss": 0.3962,
"step": 3538
},
{
"epoch": 1.7494747250030898,
"grad_norm": 0.10678213678720214,
"learning_rate": 1.1991941039603144e-05,
"loss": 0.4299,
"step": 3539
},
{
"epoch": 1.7499691014707701,
"grad_norm": 0.1104351935304358,
"learning_rate": 1.1988125029354753e-05,
"loss": 0.4073,
"step": 3540
},
{
"epoch": 1.75046347793845,
"grad_norm": 0.1598699257579217,
"learning_rate": 1.198430871765889e-05,
"loss": 0.4244,
"step": 3541
},
{
"epoch": 1.7509578544061304,
"grad_norm": 0.114331794721778,
"learning_rate": 1.1980492105094188e-05,
"loss": 0.3962,
"step": 3542
},
{
"epoch": 1.7509578544061304,
"eval_loss": 0.48693010210990906,
"eval_runtime": 100.9525,
"eval_samples_per_second": 300.676,
"eval_steps_per_second": 37.592,
"step": 3542
},
{
"epoch": 1.7514522308738103,
"grad_norm": 0.10610936867946034,
"learning_rate": 1.1976675192239345e-05,
"loss": 0.4069,
"step": 3543
},
{
"epoch": 1.7519466073414907,
"grad_norm": 0.10925478526524533,
"learning_rate": 1.1972857979673097e-05,
"loss": 0.4393,
"step": 3544
},
{
"epoch": 1.7524409838091706,
"grad_norm": 0.1067239372509578,
"learning_rate": 1.196904046797422e-05,
"loss": 0.4109,
"step": 3545
},
{
"epoch": 1.752935360276851,
"grad_norm": 0.2847790529334459,
"learning_rate": 1.1965222657721545e-05,
"loss": 0.4169,
"step": 3546
},
{
"epoch": 1.7534297367445308,
"grad_norm": 0.11283031944663754,
"learning_rate": 1.1961404549493942e-05,
"loss": 0.4172,
"step": 3547
},
{
"epoch": 1.7539241132122112,
"grad_norm": 0.1144264029867835,
"learning_rate": 1.1957586143870327e-05,
"loss": 0.4034,
"step": 3548
},
{
"epoch": 1.7544184896798911,
"grad_norm": 0.10952541768286897,
"learning_rate": 1.1953767441429664e-05,
"loss": 0.3909,
"step": 3549
},
{
"epoch": 1.7549128661475715,
"grad_norm": 0.10839800277090861,
"learning_rate": 1.1949948442750956e-05,
"loss": 0.4324,
"step": 3550
},
{
"epoch": 1.7554072426152514,
"grad_norm": 0.12118408728839762,
"learning_rate": 1.194612914841326e-05,
"loss": 0.382,
"step": 3551
},
{
"epoch": 1.7559016190829317,
"grad_norm": 0.11273401198437072,
"learning_rate": 1.1942309558995672e-05,
"loss": 0.4175,
"step": 3552
},
{
"epoch": 1.7563959955506117,
"grad_norm": 0.12369713032576074,
"learning_rate": 1.193848967507733e-05,
"loss": 0.39,
"step": 3553
},
{
"epoch": 1.756890372018292,
"grad_norm": 0.10806551281752398,
"learning_rate": 1.1934669497237423e-05,
"loss": 0.4077,
"step": 3554
},
{
"epoch": 1.757384748485972,
"grad_norm": 0.1128730217703653,
"learning_rate": 1.193084902605518e-05,
"loss": 0.4165,
"step": 3555
},
{
"epoch": 1.7578791249536523,
"grad_norm": 0.11128962767310407,
"learning_rate": 1.1927028262109874e-05,
"loss": 0.3624,
"step": 3556
},
{
"epoch": 1.7583735014213322,
"grad_norm": 0.0999546308277465,
"learning_rate": 1.1923207205980829e-05,
"loss": 0.3924,
"step": 3557
},
{
"epoch": 1.7588678778890126,
"grad_norm": 0.11164269607683668,
"learning_rate": 1.1919385858247408e-05,
"loss": 0.4119,
"step": 3558
},
{
"epoch": 1.7593622543566925,
"grad_norm": 0.11462118040616334,
"learning_rate": 1.1915564219489018e-05,
"loss": 0.3982,
"step": 3559
},
{
"epoch": 1.7598566308243728,
"grad_norm": 0.10476053399311057,
"learning_rate": 1.1911742290285111e-05,
"loss": 0.4192,
"step": 3560
},
{
"epoch": 1.7603510072920527,
"grad_norm": 0.11442844635946518,
"learning_rate": 1.1907920071215184e-05,
"loss": 0.3967,
"step": 3561
},
{
"epoch": 1.760845383759733,
"grad_norm": 0.11989910865282769,
"learning_rate": 1.1904097562858776e-05,
"loss": 0.3681,
"step": 3562
},
{
"epoch": 1.761339760227413,
"grad_norm": 0.10658063969781119,
"learning_rate": 1.1900274765795472e-05,
"loss": 0.3854,
"step": 3563
},
{
"epoch": 1.7618341366950934,
"grad_norm": 0.10965094295170055,
"learning_rate": 1.18964516806049e-05,
"loss": 0.3803,
"step": 3564
},
{
"epoch": 1.7623285131627735,
"grad_norm": 0.12359512141364688,
"learning_rate": 1.1892628307866729e-05,
"loss": 0.4249,
"step": 3565
},
{
"epoch": 1.7628228896304536,
"grad_norm": 0.1073049814405806,
"learning_rate": 1.1888804648160677e-05,
"loss": 0.4023,
"step": 3566
},
{
"epoch": 1.7633172660981338,
"grad_norm": 0.1140651648931422,
"learning_rate": 1.1884980702066502e-05,
"loss": 0.4031,
"step": 3567
},
{
"epoch": 1.763811642565814,
"grad_norm": 0.10545439747694907,
"learning_rate": 1.1881156470164006e-05,
"loss": 0.3701,
"step": 3568
},
{
"epoch": 1.764306019033494,
"grad_norm": 0.09765375232480528,
"learning_rate": 1.1877331953033031e-05,
"loss": 0.4053,
"step": 3569
},
{
"epoch": 1.7648003955011742,
"grad_norm": 0.11393618785532221,
"learning_rate": 1.1873507151253472e-05,
"loss": 0.3975,
"step": 3570
},
{
"epoch": 1.7652947719688543,
"grad_norm": 0.11480098517228951,
"learning_rate": 1.1869682065405258e-05,
"loss": 0.4133,
"step": 3571
},
{
"epoch": 1.7657891484365345,
"grad_norm": 0.10738596653169515,
"learning_rate": 1.1865856696068361e-05,
"loss": 0.3689,
"step": 3572
},
{
"epoch": 1.7662835249042146,
"grad_norm": 0.11123765326324048,
"learning_rate": 1.1862031043822802e-05,
"loss": 0.3629,
"step": 3573
},
{
"epoch": 1.7667779013718947,
"grad_norm": 0.10749496607051706,
"learning_rate": 1.1858205109248642e-05,
"loss": 0.4198,
"step": 3574
},
{
"epoch": 1.7672722778395749,
"grad_norm": 0.11660178348724105,
"learning_rate": 1.185437889292598e-05,
"loss": 0.3937,
"step": 3575
},
{
"epoch": 1.767766654307255,
"grad_norm": 0.10575245994508856,
"learning_rate": 1.1850552395434967e-05,
"loss": 0.3968,
"step": 3576
},
{
"epoch": 1.7682610307749351,
"grad_norm": 0.10758550704755816,
"learning_rate": 1.1846725617355789e-05,
"loss": 0.4259,
"step": 3577
},
{
"epoch": 1.7687554072426153,
"grad_norm": 0.11486964156880021,
"learning_rate": 1.1842898559268682e-05,
"loss": 0.412,
"step": 3578
},
{
"epoch": 1.7692497837102954,
"grad_norm": 0.10983083063021684,
"learning_rate": 1.1839071221753916e-05,
"loss": 0.4209,
"step": 3579
},
{
"epoch": 1.7697441601779755,
"grad_norm": 0.11015043775339826,
"learning_rate": 1.1835243605391806e-05,
"loss": 0.3961,
"step": 3580
},
{
"epoch": 1.7702385366456557,
"grad_norm": 0.11389785528461581,
"learning_rate": 1.1831415710762713e-05,
"loss": 0.3782,
"step": 3581
},
{
"epoch": 1.7707329131133358,
"grad_norm": 0.10718142303151368,
"learning_rate": 1.1827587538447036e-05,
"loss": 0.3969,
"step": 3582
},
{
"epoch": 1.771227289581016,
"grad_norm": 0.10347064405677986,
"learning_rate": 1.1823759089025219e-05,
"loss": 0.417,
"step": 3583
},
{
"epoch": 1.771721666048696,
"grad_norm": 0.11218605543530209,
"learning_rate": 1.181993036307775e-05,
"loss": 0.3688,
"step": 3584
},
{
"epoch": 1.7722160425163762,
"grad_norm": 0.10585861480471298,
"learning_rate": 1.181610136118515e-05,
"loss": 0.3989,
"step": 3585
},
{
"epoch": 1.7727104189840563,
"grad_norm": 0.1033491389220171,
"learning_rate": 1.1812272083927989e-05,
"loss": 0.4165,
"step": 3586
},
{
"epoch": 1.7732047954517365,
"grad_norm": 0.11015987527819131,
"learning_rate": 1.180844253188688e-05,
"loss": 0.4041,
"step": 3587
},
{
"epoch": 1.7736991719194166,
"grad_norm": 0.11303394646081233,
"learning_rate": 1.1804612705642476e-05,
"loss": 0.4042,
"step": 3588
},
{
"epoch": 1.7741935483870968,
"grad_norm": 0.10886590365152332,
"learning_rate": 1.1800782605775463e-05,
"loss": 0.4407,
"step": 3589
},
{
"epoch": 1.7746879248547769,
"grad_norm": 0.11328526675169771,
"learning_rate": 1.1796952232866584e-05,
"loss": 0.3809,
"step": 3590
},
{
"epoch": 1.775182301322457,
"grad_norm": 0.1101261446301266,
"learning_rate": 1.1793121587496612e-05,
"loss": 0.3798,
"step": 3591
},
{
"epoch": 1.7756766777901372,
"grad_norm": 0.10852575341349664,
"learning_rate": 1.1789290670246365e-05,
"loss": 0.4396,
"step": 3592
},
{
"epoch": 1.7761710542578173,
"grad_norm": 0.1205468978294275,
"learning_rate": 1.1785459481696704e-05,
"loss": 0.4041,
"step": 3593
},
{
"epoch": 1.7766654307254974,
"grad_norm": 0.10997392520562163,
"learning_rate": 1.1781628022428527e-05,
"loss": 0.3947,
"step": 3594
},
{
"epoch": 1.7771598071931776,
"grad_norm": 0.10584475138333241,
"learning_rate": 1.1777796293022774e-05,
"loss": 0.3927,
"step": 3595
},
{
"epoch": 1.7776541836608577,
"grad_norm": 0.10742179922264236,
"learning_rate": 1.1773964294060431e-05,
"loss": 0.4339,
"step": 3596
},
{
"epoch": 1.7781485601285378,
"grad_norm": 0.11098425354910339,
"learning_rate": 1.1770132026122518e-05,
"loss": 0.4076,
"step": 3597
},
{
"epoch": 1.778642936596218,
"grad_norm": 0.11104301586743287,
"learning_rate": 1.1766299489790098e-05,
"loss": 0.3963,
"step": 3598
},
{
"epoch": 1.779137313063898,
"grad_norm": 0.11512803917796643,
"learning_rate": 1.1762466685644278e-05,
"loss": 0.432,
"step": 3599
},
{
"epoch": 1.7796316895315782,
"grad_norm": 0.1613611850677917,
"learning_rate": 1.1758633614266206e-05,
"loss": 0.4382,
"step": 3600
},
{
"epoch": 1.7801260659992586,
"grad_norm": 0.7267573374357653,
"learning_rate": 1.1754800276237061e-05,
"loss": 0.4492,
"step": 3601
},
{
"epoch": 1.7806204424669385,
"grad_norm": 0.11937159404904096,
"learning_rate": 1.175096667213807e-05,
"loss": 0.3913,
"step": 3602
},
{
"epoch": 1.7811148189346189,
"grad_norm": 0.1254790415186598,
"learning_rate": 1.1747132802550504e-05,
"loss": 0.3965,
"step": 3603
},
{
"epoch": 1.7816091954022988,
"grad_norm": 0.1238659096799349,
"learning_rate": 1.174329866805567e-05,
"loss": 0.4102,
"step": 3604
},
{
"epoch": 1.7821035718699791,
"grad_norm": 0.12337570789102904,
"learning_rate": 1.1739464269234908e-05,
"loss": 0.4372,
"step": 3605
},
{
"epoch": 1.782597948337659,
"grad_norm": 0.13138899891021713,
"learning_rate": 1.1735629606669609e-05,
"loss": 0.427,
"step": 3606
},
{
"epoch": 1.7830923248053394,
"grad_norm": 0.11753528747130004,
"learning_rate": 1.1731794680941201e-05,
"loss": 0.4271,
"step": 3607
},
{
"epoch": 1.7835867012730193,
"grad_norm": 0.12189418708616899,
"learning_rate": 1.172795949263115e-05,
"loss": 0.4114,
"step": 3608
},
{
"epoch": 1.7840810777406997,
"grad_norm": 0.13888941971375815,
"learning_rate": 1.1724124042320958e-05,
"loss": 0.3787,
"step": 3609
},
{
"epoch": 1.7845754542083796,
"grad_norm": 0.11191073061993527,
"learning_rate": 1.172028833059218e-05,
"loss": 0.4161,
"step": 3610
},
{
"epoch": 1.78506983067606,
"grad_norm": 0.11999703389102359,
"learning_rate": 1.1716452358026396e-05,
"loss": 0.403,
"step": 3611
},
{
"epoch": 1.7855642071437399,
"grad_norm": 0.11929522720138058,
"learning_rate": 1.1712616125205235e-05,
"loss": 0.4078,
"step": 3612
},
{
"epoch": 1.7860585836114202,
"grad_norm": 0.11250858956574468,
"learning_rate": 1.1708779632710357e-05,
"loss": 0.4339,
"step": 3613
},
{
"epoch": 1.7865529600791001,
"grad_norm": 0.11818941190427658,
"learning_rate": 1.1704942881123469e-05,
"loss": 0.4131,
"step": 3614
},
{
"epoch": 1.7870473365467805,
"grad_norm": 0.11981471496755289,
"learning_rate": 1.1701105871026317e-05,
"loss": 0.4251,
"step": 3615
},
{
"epoch": 1.7875417130144604,
"grad_norm": 0.1527133007249802,
"learning_rate": 1.169726860300068e-05,
"loss": 0.3989,
"step": 3616
},
{
"epoch": 1.7880360894821408,
"grad_norm": 0.11148447612005873,
"learning_rate": 1.1693431077628383e-05,
"loss": 0.4327,
"step": 3617
},
{
"epoch": 1.7885304659498207,
"grad_norm": 0.12278905507033352,
"learning_rate": 1.1689593295491286e-05,
"loss": 0.4016,
"step": 3618
},
{
"epoch": 1.789024842417501,
"grad_norm": 0.1201122783406613,
"learning_rate": 1.1685755257171286e-05,
"loss": 0.4122,
"step": 3619
},
{
"epoch": 1.789519218885181,
"grad_norm": 0.19993399560657255,
"learning_rate": 1.1681916963250326e-05,
"loss": 0.4085,
"step": 3620
},
{
"epoch": 1.7900135953528613,
"grad_norm": 0.11909537670319184,
"learning_rate": 1.1678078414310382e-05,
"loss": 0.4048,
"step": 3621
},
{
"epoch": 1.7905079718205412,
"grad_norm": 0.11884069919696469,
"learning_rate": 1.167423961093347e-05,
"loss": 0.3795,
"step": 3622
},
{
"epoch": 1.7910023482882216,
"grad_norm": 0.3275607647813885,
"learning_rate": 1.1670400553701644e-05,
"loss": 0.4168,
"step": 3623
},
{
"epoch": 1.7914967247559015,
"grad_norm": 0.11914076392346624,
"learning_rate": 1.1666561243196997e-05,
"loss": 0.4377,
"step": 3624
},
{
"epoch": 1.7919911012235819,
"grad_norm": 0.12062010872201206,
"learning_rate": 1.1662721680001664e-05,
"loss": 0.4435,
"step": 3625
},
{
"epoch": 1.7924854776912618,
"grad_norm": 0.11351919863348836,
"learning_rate": 1.1658881864697808e-05,
"loss": 0.403,
"step": 3626
},
{
"epoch": 1.7929798541589421,
"grad_norm": 0.11713510941600028,
"learning_rate": 1.1655041797867645e-05,
"loss": 0.4307,
"step": 3627
},
{
"epoch": 1.793474230626622,
"grad_norm": 0.11294125544863909,
"learning_rate": 1.165120148009342e-05,
"loss": 0.4062,
"step": 3628
},
{
"epoch": 1.7939686070943024,
"grad_norm": 0.11730209755509212,
"learning_rate": 1.1647360911957413e-05,
"loss": 0.4319,
"step": 3629
},
{
"epoch": 1.7944629835619823,
"grad_norm": 0.11784984835896192,
"learning_rate": 1.1643520094041949e-05,
"loss": 0.4113,
"step": 3630
},
{
"epoch": 1.7949573600296627,
"grad_norm": 0.10776545607115945,
"learning_rate": 1.1639679026929387e-05,
"loss": 0.4145,
"step": 3631
},
{
"epoch": 1.7954517364973426,
"grad_norm": 0.11245056204302777,
"learning_rate": 1.1635837711202125e-05,
"loss": 0.4182,
"step": 3632
},
{
"epoch": 1.795946112965023,
"grad_norm": 0.15340448119991812,
"learning_rate": 1.1631996147442604e-05,
"loss": 0.3689,
"step": 3633
},
{
"epoch": 1.7964404894327028,
"grad_norm": 0.10822735628543058,
"learning_rate": 1.1628154336233288e-05,
"loss": 0.4036,
"step": 3634
},
{
"epoch": 1.7969348659003832,
"grad_norm": 0.10838538184544555,
"learning_rate": 1.1624312278156693e-05,
"loss": 0.4299,
"step": 3635
},
{
"epoch": 1.7974292423680631,
"grad_norm": 0.25179804189358473,
"learning_rate": 1.162046997379537e-05,
"loss": 0.4265,
"step": 3636
},
{
"epoch": 1.7979236188357435,
"grad_norm": 0.11006897723625056,
"learning_rate": 1.1616627423731898e-05,
"loss": 0.3952,
"step": 3637
},
{
"epoch": 1.7984179953034236,
"grad_norm": 0.10922986358228923,
"learning_rate": 1.1612784628548902e-05,
"loss": 0.4204,
"step": 3638
},
{
"epoch": 1.7989123717711037,
"grad_norm": 0.11182170420473969,
"learning_rate": 1.1608941588829045e-05,
"loss": 0.3934,
"step": 3639
},
{
"epoch": 1.7994067482387839,
"grad_norm": 0.10935139763156747,
"learning_rate": 1.1605098305155025e-05,
"loss": 0.4145,
"step": 3640
},
{
"epoch": 1.799901124706464,
"grad_norm": 0.11209800531047873,
"learning_rate": 1.1601254778109572e-05,
"loss": 0.3869,
"step": 3641
},
{
"epoch": 1.8003955011741442,
"grad_norm": 0.10985245839166795,
"learning_rate": 1.1597411008275456e-05,
"loss": 0.4219,
"step": 3642
},
{
"epoch": 1.8008898776418243,
"grad_norm": 0.11445099987437608,
"learning_rate": 1.1593566996235487e-05,
"loss": 0.4113,
"step": 3643
},
{
"epoch": 1.8013842541095044,
"grad_norm": 0.11404495808476939,
"learning_rate": 1.1589722742572513e-05,
"loss": 0.3796,
"step": 3644
},
{
"epoch": 1.8018786305771846,
"grad_norm": 0.1024170632596903,
"learning_rate": 1.1585878247869408e-05,
"loss": 0.4146,
"step": 3645
},
{
"epoch": 1.8023730070448647,
"grad_norm": 0.11492156295941242,
"learning_rate": 1.1582033512709096e-05,
"loss": 0.5794,
"step": 3646
},
{
"epoch": 1.8028673835125448,
"grad_norm": 0.8973640574882236,
"learning_rate": 1.1578188537674529e-05,
"loss": 0.4181,
"step": 3647
},
{
"epoch": 1.803361759980225,
"grad_norm": 0.12793567948917647,
"learning_rate": 1.1574343323348693e-05,
"loss": 0.411,
"step": 3648
},
{
"epoch": 1.803856136447905,
"grad_norm": 0.12539770053177487,
"learning_rate": 1.1570497870314622e-05,
"loss": 0.4118,
"step": 3649
},
{
"epoch": 1.8043505129155852,
"grad_norm": 0.11608997711299336,
"learning_rate": 1.1566652179155375e-05,
"loss": 0.3988,
"step": 3650
},
{
"epoch": 1.8048448893832654,
"grad_norm": 0.13402565250262286,
"learning_rate": 1.1562806250454051e-05,
"loss": 0.4537,
"step": 3651
},
{
"epoch": 1.8053392658509455,
"grad_norm": 0.14884530880314917,
"learning_rate": 1.1558960084793786e-05,
"loss": 0.3989,
"step": 3652
},
{
"epoch": 1.8058336423186256,
"grad_norm": 0.1301793854789664,
"learning_rate": 1.1555113682757754e-05,
"loss": 0.3918,
"step": 3653
},
{
"epoch": 1.8063280187863058,
"grad_norm": 0.13125830943593947,
"learning_rate": 1.1551267044929155e-05,
"loss": 0.4061,
"step": 3654
},
{
"epoch": 1.806822395253986,
"grad_norm": 0.12022245047107635,
"learning_rate": 1.1547420171891237e-05,
"loss": 0.401,
"step": 3655
},
{
"epoch": 1.807316771721666,
"grad_norm": 0.14193351004485735,
"learning_rate": 1.1543573064227278e-05,
"loss": 0.3809,
"step": 3656
},
{
"epoch": 1.8078111481893462,
"grad_norm": 0.11756142327938991,
"learning_rate": 1.1539725722520587e-05,
"loss": 0.4054,
"step": 3657
},
{
"epoch": 1.8083055246570263,
"grad_norm": 0.11769357147129136,
"learning_rate": 1.153587814735452e-05,
"loss": 0.3999,
"step": 3658
},
{
"epoch": 1.8087999011247065,
"grad_norm": 0.12258127564243379,
"learning_rate": 1.1532030339312459e-05,
"loss": 0.3983,
"step": 3659
},
{
"epoch": 1.8092942775923866,
"grad_norm": 0.11640454307819756,
"learning_rate": 1.1528182298977824e-05,
"loss": 0.3937,
"step": 3660
},
{
"epoch": 1.8097886540600667,
"grad_norm": 0.10958438195696417,
"learning_rate": 1.152433402693407e-05,
"loss": 0.3889,
"step": 3661
},
{
"epoch": 1.8102830305277469,
"grad_norm": 0.11695212608096629,
"learning_rate": 1.1520485523764686e-05,
"loss": 0.4123,
"step": 3662
},
{
"epoch": 1.810777406995427,
"grad_norm": 0.12071046451972758,
"learning_rate": 1.15166367900532e-05,
"loss": 0.3777,
"step": 3663
},
{
"epoch": 1.8112717834631071,
"grad_norm": 0.10651033392493578,
"learning_rate": 1.1512787826383172e-05,
"loss": 0.4014,
"step": 3664
},
{
"epoch": 1.8117661599307873,
"grad_norm": 0.11652695839781073,
"learning_rate": 1.1508938633338191e-05,
"loss": 0.3927,
"step": 3665
},
{
"epoch": 1.8122605363984674,
"grad_norm": 0.11454539156262401,
"learning_rate": 1.15050892115019e-05,
"loss": 0.4338,
"step": 3666
},
{
"epoch": 1.8127549128661475,
"grad_norm": 0.1638533740678505,
"learning_rate": 1.1501239561457955e-05,
"loss": 0.4258,
"step": 3667
},
{
"epoch": 1.8132492893338277,
"grad_norm": 0.10882757477776478,
"learning_rate": 1.1497389683790055e-05,
"loss": 0.4288,
"step": 3668
},
{
"epoch": 1.8137436658015078,
"grad_norm": 0.11406430339505004,
"learning_rate": 1.1493539579081938e-05,
"loss": 0.3949,
"step": 3669
},
{
"epoch": 1.814238042269188,
"grad_norm": 0.11660729084301895,
"learning_rate": 1.1489689247917368e-05,
"loss": 0.4049,
"step": 3670
},
{
"epoch": 1.814732418736868,
"grad_norm": 0.1106939930021246,
"learning_rate": 1.1485838690880148e-05,
"loss": 0.4073,
"step": 3671
},
{
"epoch": 1.8152267952045482,
"grad_norm": 0.11157891092413924,
"learning_rate": 1.148198790855412e-05,
"loss": 0.3941,
"step": 3672
},
{
"epoch": 1.8157211716722284,
"grad_norm": 0.11417925027232446,
"learning_rate": 1.147813690152315e-05,
"loss": 0.3892,
"step": 3673
},
{
"epoch": 1.8162155481399085,
"grad_norm": 0.5295939367964658,
"learning_rate": 1.1474285670371146e-05,
"loss": 0.4162,
"step": 3674
},
{
"epoch": 1.8167099246075886,
"grad_norm": 0.12392873947218744,
"learning_rate": 1.1470434215682045e-05,
"loss": 0.4091,
"step": 3675
},
{
"epoch": 1.817204301075269,
"grad_norm": 0.11834034647057698,
"learning_rate": 1.1466582538039821e-05,
"loss": 0.5242,
"step": 3676
},
{
"epoch": 1.817698677542949,
"grad_norm": 2.088854689621688,
"learning_rate": 1.1462730638028479e-05,
"loss": 0.4184,
"step": 3677
},
{
"epoch": 1.8181930540106293,
"grad_norm": 0.12538862521130334,
"learning_rate": 1.1458878516232061e-05,
"loss": 0.4071,
"step": 3678
},
{
"epoch": 1.8186874304783092,
"grad_norm": 0.1289472663308296,
"learning_rate": 1.1455026173234644e-05,
"loss": 0.3902,
"step": 3679
},
{
"epoch": 1.8191818069459895,
"grad_norm": 0.12196571585018912,
"learning_rate": 1.1451173609620331e-05,
"loss": 0.3739,
"step": 3680
},
{
"epoch": 1.8196761834136694,
"grad_norm": 0.1199903277867674,
"learning_rate": 1.1447320825973263e-05,
"loss": 0.4036,
"step": 3681
},
{
"epoch": 1.8201705598813498,
"grad_norm": 0.1256612713501704,
"learning_rate": 1.144346782287762e-05,
"loss": 0.4082,
"step": 3682
},
{
"epoch": 1.8206649363490297,
"grad_norm": 0.12831248461913541,
"learning_rate": 1.1439614600917604e-05,
"loss": 0.4064,
"step": 3683
},
{
"epoch": 1.82115931281671,
"grad_norm": 0.1228880736106697,
"learning_rate": 1.1435761160677457e-05,
"loss": 0.4177,
"step": 3684
},
{
"epoch": 1.82165368928439,
"grad_norm": 0.11312111895943244,
"learning_rate": 1.1431907502741455e-05,
"loss": 0.3941,
"step": 3685
},
{
"epoch": 1.8221480657520703,
"grad_norm": 0.12851822955490627,
"learning_rate": 1.1428053627693908e-05,
"loss": 0.415,
"step": 3686
},
{
"epoch": 1.8226424422197502,
"grad_norm": 0.11893737806170658,
"learning_rate": 1.1424199536119147e-05,
"loss": 0.4605,
"step": 3687
},
{
"epoch": 1.8231368186874306,
"grad_norm": 0.1988784380966453,
"learning_rate": 1.1420345228601553e-05,
"loss": 0.4208,
"step": 3688
},
{
"epoch": 1.8236311951551105,
"grad_norm": 0.11383426931591628,
"learning_rate": 1.141649070572553e-05,
"loss": 0.4167,
"step": 3689
},
{
"epoch": 1.8241255716227909,
"grad_norm": 0.11270210942641407,
"learning_rate": 1.141263596807551e-05,
"loss": 0.4148,
"step": 3690
},
{
"epoch": 1.8246199480904708,
"grad_norm": 0.10929896549774998,
"learning_rate": 1.140878101623597e-05,
"loss": 0.4298,
"step": 3691
},
{
"epoch": 1.8251143245581511,
"grad_norm": 0.11597587242483343,
"learning_rate": 1.1404925850791414e-05,
"loss": 0.4103,
"step": 3692
},
{
"epoch": 1.825608701025831,
"grad_norm": 0.1121610795344582,
"learning_rate": 1.1401070472326372e-05,
"loss": 0.4213,
"step": 3693
},
{
"epoch": 1.8261030774935114,
"grad_norm": 0.2969833763726523,
"learning_rate": 1.1397214881425417e-05,
"loss": 0.4041,
"step": 3694
},
{
"epoch": 1.8265974539611913,
"grad_norm": 0.11099180855630339,
"learning_rate": 1.1393359078673148e-05,
"loss": 0.3806,
"step": 3695
},
{
"epoch": 1.8270918304288717,
"grad_norm": 0.1054196126113011,
"learning_rate": 1.1389503064654194e-05,
"loss": 0.3881,
"step": 3696
},
{
"epoch": 1.8275862068965516,
"grad_norm": 0.11000001705842102,
"learning_rate": 1.1385646839953223e-05,
"loss": 0.429,
"step": 3697
},
{
"epoch": 1.828080583364232,
"grad_norm": 0.11513190842567664,
"learning_rate": 1.1381790405154933e-05,
"loss": 0.3986,
"step": 3698
},
{
"epoch": 1.8285749598319119,
"grad_norm": 0.10820095873681103,
"learning_rate": 1.1377933760844047e-05,
"loss": 0.3824,
"step": 3699
},
{
"epoch": 1.8290693362995922,
"grad_norm": 0.11273841508186586,
"learning_rate": 1.1374076907605326e-05,
"loss": 0.3992,
"step": 3700
},
{
"epoch": 1.8295637127672721,
"grad_norm": 0.10876611298396514,
"learning_rate": 1.1370219846023562e-05,
"loss": 0.4047,
"step": 3701
},
{
"epoch": 1.8300580892349525,
"grad_norm": 0.10979461848082893,
"learning_rate": 1.1366362576683583e-05,
"loss": 0.4023,
"step": 3702
},
{
"epoch": 1.8305524657026324,
"grad_norm": 0.10757366266709531,
"learning_rate": 1.1362505100170234e-05,
"loss": 0.3789,
"step": 3703
},
{
"epoch": 1.8310468421703128,
"grad_norm": 0.10430056051666226,
"learning_rate": 1.1358647417068408e-05,
"loss": 0.4132,
"step": 3704
},
{
"epoch": 1.8315412186379927,
"grad_norm": 0.10797710393779056,
"learning_rate": 1.1354789527963026e-05,
"loss": 0.3919,
"step": 3705
},
{
"epoch": 1.832035595105673,
"grad_norm": 0.12111678865187323,
"learning_rate": 1.1350931433439026e-05,
"loss": 0.3788,
"step": 3706
},
{
"epoch": 1.832529971573353,
"grad_norm": 0.10477347745981704,
"learning_rate": 1.1347073134081392e-05,
"loss": 0.405,
"step": 3707
},
{
"epoch": 1.8330243480410333,
"grad_norm": 0.10847572183091371,
"learning_rate": 1.1343214630475139e-05,
"loss": 0.3722,
"step": 3708
},
{
"epoch": 1.8335187245087132,
"grad_norm": 0.12223400233170903,
"learning_rate": 1.1339355923205304e-05,
"loss": 0.3894,
"step": 3709
},
{
"epoch": 1.8340131009763936,
"grad_norm": 0.10851551073863876,
"learning_rate": 1.1335497012856963e-05,
"loss": 0.4063,
"step": 3710
},
{
"epoch": 1.8345074774440735,
"grad_norm": 0.12124462617705101,
"learning_rate": 1.1331637900015215e-05,
"loss": 0.4069,
"step": 3711
},
{
"epoch": 1.8350018539117539,
"grad_norm": 0.13728882987560695,
"learning_rate": 1.13277785852652e-05,
"loss": 0.371,
"step": 3712
},
{
"epoch": 1.835496230379434,
"grad_norm": 0.1150011154064361,
"learning_rate": 1.1323919069192075e-05,
"loss": 0.4391,
"step": 3713
},
{
"epoch": 1.8359906068471141,
"grad_norm": 0.11198575586066792,
"learning_rate": 1.1320059352381044e-05,
"loss": 0.3994,
"step": 3714
},
{
"epoch": 1.8364849833147943,
"grad_norm": 0.11072826521011607,
"learning_rate": 1.1316199435417328e-05,
"loss": 0.4137,
"step": 3715
},
{
"epoch": 1.8369793597824744,
"grad_norm": 0.12486321999826938,
"learning_rate": 1.1312339318886183e-05,
"loss": 0.4064,
"step": 3716
},
{
"epoch": 1.8374737362501545,
"grad_norm": 0.11238424828263748,
"learning_rate": 1.1308479003372895e-05,
"loss": 0.417,
"step": 3717
},
{
"epoch": 1.8379681127178347,
"grad_norm": 0.12032079973897254,
"learning_rate": 1.1304618489462782e-05,
"loss": 0.3966,
"step": 3718
},
{
"epoch": 1.8384624891855148,
"grad_norm": 0.1070398345878948,
"learning_rate": 1.1300757777741191e-05,
"loss": 0.3854,
"step": 3719
},
{
"epoch": 1.838956865653195,
"grad_norm": 0.1207785544078471,
"learning_rate": 1.1296896868793494e-05,
"loss": 0.4128,
"step": 3720
},
{
"epoch": 1.839451242120875,
"grad_norm": 0.11110964689165295,
"learning_rate": 1.1293035763205108e-05,
"loss": 0.4101,
"step": 3721
},
{
"epoch": 1.8399456185885552,
"grad_norm": 0.10635955274850957,
"learning_rate": 1.1289174461561456e-05,
"loss": 0.4061,
"step": 3722
},
{
"epoch": 1.8404399950562353,
"grad_norm": 0.12392119308492107,
"learning_rate": 1.1285312964448014e-05,
"loss": 0.4058,
"step": 3723
},
{
"epoch": 1.8409343715239155,
"grad_norm": 0.10849380867221958,
"learning_rate": 1.1281451272450271e-05,
"loss": 0.3954,
"step": 3724
},
{
"epoch": 1.8414287479915956,
"grad_norm": 0.11060420702604355,
"learning_rate": 1.1277589386153757e-05,
"loss": 0.4135,
"step": 3725
},
{
"epoch": 1.8419231244592758,
"grad_norm": 0.10570530948883147,
"learning_rate": 1.1273727306144027e-05,
"loss": 0.408,
"step": 3726
},
{
"epoch": 1.8424175009269559,
"grad_norm": 0.11445658301448217,
"learning_rate": 1.1269865033006661e-05,
"loss": 0.3934,
"step": 3727
},
{
"epoch": 1.842911877394636,
"grad_norm": 0.104491040727478,
"learning_rate": 1.1266002567327275e-05,
"loss": 0.4036,
"step": 3728
},
{
"epoch": 1.8434062538623162,
"grad_norm": 0.10850871755369444,
"learning_rate": 1.126213990969151e-05,
"loss": 0.4064,
"step": 3729
},
{
"epoch": 1.8439006303299963,
"grad_norm": 0.10948643260744971,
"learning_rate": 1.125827706068504e-05,
"loss": 0.422,
"step": 3730
},
{
"epoch": 1.8443950067976764,
"grad_norm": 0.1048245539474238,
"learning_rate": 1.125441402089356e-05,
"loss": 0.4053,
"step": 3731
},
{
"epoch": 1.8448893832653566,
"grad_norm": 0.11101163896690228,
"learning_rate": 1.1250550790902808e-05,
"loss": 0.4019,
"step": 3732
},
{
"epoch": 1.8453837597330367,
"grad_norm": 0.10670302552517708,
"learning_rate": 1.1246687371298532e-05,
"loss": 0.3904,
"step": 3733
},
{
"epoch": 1.8458781362007168,
"grad_norm": 0.10004800690081833,
"learning_rate": 1.124282376266653e-05,
"loss": 0.3923,
"step": 3734
},
{
"epoch": 1.846372512668397,
"grad_norm": 0.10680057453762565,
"learning_rate": 1.1238959965592615e-05,
"loss": 0.4169,
"step": 3735
},
{
"epoch": 1.846866889136077,
"grad_norm": 0.11664039435721471,
"learning_rate": 1.1235095980662623e-05,
"loss": 0.4182,
"step": 3736
},
{
"epoch": 1.8473612656037572,
"grad_norm": 0.10711956823067245,
"learning_rate": 1.1231231808462438e-05,
"loss": 0.3855,
"step": 3737
},
{
"epoch": 1.8478556420714374,
"grad_norm": 0.10364859932353239,
"learning_rate": 1.1227367449577958e-05,
"loss": 0.412,
"step": 3738
},
{
"epoch": 1.8483500185391175,
"grad_norm": 0.10751427205281007,
"learning_rate": 1.1223502904595105e-05,
"loss": 0.4082,
"step": 3739
},
{
"epoch": 1.8488443950067976,
"grad_norm": 0.11309294054154315,
"learning_rate": 1.1219638174099846e-05,
"loss": 0.4125,
"step": 3740
},
{
"epoch": 1.8493387714744778,
"grad_norm": 0.10981665241465255,
"learning_rate": 1.1215773258678161e-05,
"loss": 0.4028,
"step": 3741
},
{
"epoch": 1.849833147942158,
"grad_norm": 0.11647657045664817,
"learning_rate": 1.1211908158916072e-05,
"loss": 0.441,
"step": 3742
},
{
"epoch": 1.850327524409838,
"grad_norm": 0.12779767216491394,
"learning_rate": 1.1208042875399611e-05,
"loss": 0.4044,
"step": 3743
},
{
"epoch": 1.8508219008775182,
"grad_norm": 0.11060586651490287,
"learning_rate": 1.1204177408714856e-05,
"loss": 0.3947,
"step": 3744
},
{
"epoch": 1.8513162773451983,
"grad_norm": 0.10960063419950361,
"learning_rate": 1.12003117594479e-05,
"loss": 0.3862,
"step": 3745
},
{
"epoch": 1.8518106538128785,
"grad_norm": 0.10828687379961212,
"learning_rate": 1.1196445928184866e-05,
"loss": 0.4037,
"step": 3746
},
{
"epoch": 1.8523050302805586,
"grad_norm": 0.11522593116322291,
"learning_rate": 1.119257991551191e-05,
"loss": 0.4212,
"step": 3747
},
{
"epoch": 1.8527994067482387,
"grad_norm": 0.11573319249635079,
"learning_rate": 1.1188713722015217e-05,
"loss": 0.4195,
"step": 3748
},
{
"epoch": 1.853293783215919,
"grad_norm": 0.10512118172511446,
"learning_rate": 1.1184847348280987e-05,
"loss": 0.4174,
"step": 3749
},
{
"epoch": 1.853788159683599,
"grad_norm": 0.10681982217004374,
"learning_rate": 1.1180980794895458e-05,
"loss": 0.384,
"step": 3750
},
{
"epoch": 1.8542825361512794,
"grad_norm": 0.10343768277788586,
"learning_rate": 1.1177114062444894e-05,
"loss": 0.3984,
"step": 3751
},
{
"epoch": 1.8547769126189593,
"grad_norm": 0.10925369268798643,
"learning_rate": 1.1173247151515578e-05,
"loss": 0.3957,
"step": 3752
},
{
"epoch": 1.8552712890866396,
"grad_norm": 0.10095720095360153,
"learning_rate": 1.1169380062693835e-05,
"loss": 0.3926,
"step": 3753
},
{
"epoch": 1.8557656655543195,
"grad_norm": 0.10837058944533413,
"learning_rate": 1.1165512796566006e-05,
"loss": 0.4002,
"step": 3754
},
{
"epoch": 1.856260042022,
"grad_norm": 0.10913358691069246,
"learning_rate": 1.1161645353718458e-05,
"loss": 0.4202,
"step": 3755
},
{
"epoch": 1.8567544184896798,
"grad_norm": 0.10634710031890034,
"learning_rate": 1.1157777734737589e-05,
"loss": 0.3989,
"step": 3756
},
{
"epoch": 1.8572487949573602,
"grad_norm": 0.11407075380540647,
"learning_rate": 1.1153909940209829e-05,
"loss": 0.3946,
"step": 3757
},
{
"epoch": 1.85774317142504,
"grad_norm": 0.10272311388482866,
"learning_rate": 1.1150041970721618e-05,
"loss": 0.3812,
"step": 3758
},
{
"epoch": 1.8582375478927204,
"grad_norm": 0.1058001056961963,
"learning_rate": 1.1146173826859443e-05,
"loss": 0.3891,
"step": 3759
},
{
"epoch": 1.8587319243604004,
"grad_norm": 0.10980437183427255,
"learning_rate": 1.1142305509209801e-05,
"loss": 0.4157,
"step": 3760
},
{
"epoch": 1.8592263008280807,
"grad_norm": 0.11112344927893898,
"learning_rate": 1.1138437018359225e-05,
"loss": 0.3967,
"step": 3761
},
{
"epoch": 1.8597206772957606,
"grad_norm": 0.10307983672677537,
"learning_rate": 1.1134568354894271e-05,
"loss": 0.4089,
"step": 3762
},
{
"epoch": 1.860215053763441,
"grad_norm": 0.10438125944967279,
"learning_rate": 1.1130699519401515e-05,
"loss": 0.3831,
"step": 3763
},
{
"epoch": 1.860709430231121,
"grad_norm": 0.10442469294715592,
"learning_rate": 1.112683051246758e-05,
"loss": 0.3907,
"step": 3764
},
{
"epoch": 1.8612038066988013,
"grad_norm": 0.10572928898587489,
"learning_rate": 1.1122961334679086e-05,
"loss": 0.4065,
"step": 3765
},
{
"epoch": 1.8616981831664812,
"grad_norm": 0.1096961559811813,
"learning_rate": 1.1119091986622695e-05,
"loss": 0.3969,
"step": 3766
},
{
"epoch": 1.8621925596341615,
"grad_norm": 0.10759350618637953,
"learning_rate": 1.1115222468885098e-05,
"loss": 0.4023,
"step": 3767
},
{
"epoch": 1.8626869361018414,
"grad_norm": 0.11014259062730221,
"learning_rate": 1.1111352782053008e-05,
"loss": 0.3961,
"step": 3768
},
{
"epoch": 1.8631813125695218,
"grad_norm": 0.11289109950829516,
"learning_rate": 1.1107482926713156e-05,
"loss": 0.4258,
"step": 3769
},
{
"epoch": 1.8636756890372017,
"grad_norm": 0.1118463676966669,
"learning_rate": 1.110361290345231e-05,
"loss": 0.3986,
"step": 3770
},
{
"epoch": 1.864170065504882,
"grad_norm": 0.10703854846148139,
"learning_rate": 1.109974271285726e-05,
"loss": 0.4116,
"step": 3771
},
{
"epoch": 1.864664441972562,
"grad_norm": 0.11080676478393228,
"learning_rate": 1.109587235551481e-05,
"loss": 0.3923,
"step": 3772
},
{
"epoch": 1.8651588184402423,
"grad_norm": 0.11157420013693141,
"learning_rate": 1.109200183201181e-05,
"loss": 0.4106,
"step": 3773
},
{
"epoch": 1.8656531949079223,
"grad_norm": 0.10511842546063528,
"learning_rate": 1.1088131142935119e-05,
"loss": 0.3872,
"step": 3774
},
{
"epoch": 1.8661475713756026,
"grad_norm": 0.11189810433644755,
"learning_rate": 1.1084260288871627e-05,
"loss": 0.3944,
"step": 3775
},
{
"epoch": 1.8666419478432825,
"grad_norm": 0.11146174043249911,
"learning_rate": 1.108038927040825e-05,
"loss": 0.3904,
"step": 3776
},
{
"epoch": 1.8671363243109629,
"grad_norm": 0.11156468685671271,
"learning_rate": 1.1076518088131924e-05,
"loss": 0.4084,
"step": 3777
},
{
"epoch": 1.8676307007786428,
"grad_norm": 0.10884865094979655,
"learning_rate": 1.1072646742629617e-05,
"loss": 0.3959,
"step": 3778
},
{
"epoch": 1.8681250772463232,
"grad_norm": 0.10485464784770998,
"learning_rate": 1.1068775234488317e-05,
"loss": 0.4154,
"step": 3779
},
{
"epoch": 1.868619453714003,
"grad_norm": 0.10710074623476334,
"learning_rate": 1.1064903564295033e-05,
"loss": 0.4064,
"step": 3780
},
{
"epoch": 1.8691138301816834,
"grad_norm": 0.11598722899256968,
"learning_rate": 1.1061031732636809e-05,
"loss": 0.4156,
"step": 3781
},
{
"epoch": 1.8696082066493633,
"grad_norm": 0.11483553417405816,
"learning_rate": 1.1057159740100705e-05,
"loss": 0.4079,
"step": 3782
},
{
"epoch": 1.8701025831170437,
"grad_norm": 0.11792287881862638,
"learning_rate": 1.1053287587273806e-05,
"loss": 0.3632,
"step": 3783
},
{
"epoch": 1.8705969595847236,
"grad_norm": 0.10378936635988766,
"learning_rate": 1.1049415274743231e-05,
"loss": 0.4273,
"step": 3784
},
{
"epoch": 1.871091336052404,
"grad_norm": 0.1046663881907454,
"learning_rate": 1.1045542803096106e-05,
"loss": 0.3897,
"step": 3785
},
{
"epoch": 1.871585712520084,
"grad_norm": 0.11556243139531705,
"learning_rate": 1.1041670172919597e-05,
"loss": 0.426,
"step": 3786
},
{
"epoch": 1.8720800889877642,
"grad_norm": 0.10866438477748262,
"learning_rate": 1.1037797384800886e-05,
"loss": 0.4131,
"step": 3787
},
{
"epoch": 1.8725744654554444,
"grad_norm": 0.1055247416680549,
"learning_rate": 1.1033924439327177e-05,
"loss": 0.3952,
"step": 3788
},
{
"epoch": 1.8730688419231245,
"grad_norm": 0.10618574709712816,
"learning_rate": 1.1030051337085708e-05,
"loss": 0.4104,
"step": 3789
},
{
"epoch": 1.8735632183908046,
"grad_norm": 0.111018811410703,
"learning_rate": 1.1026178078663729e-05,
"loss": 0.3783,
"step": 3790
},
{
"epoch": 1.8740575948584848,
"grad_norm": 0.1120409750654626,
"learning_rate": 1.1022304664648524e-05,
"loss": 0.4047,
"step": 3791
},
{
"epoch": 1.874551971326165,
"grad_norm": 0.11214941111402962,
"learning_rate": 1.1018431095627391e-05,
"loss": 0.3988,
"step": 3792
},
{
"epoch": 1.875046347793845,
"grad_norm": 0.11313808151592668,
"learning_rate": 1.1014557372187658e-05,
"loss": 0.435,
"step": 3793
},
{
"epoch": 1.8755407242615252,
"grad_norm": 0.11298238514160322,
"learning_rate": 1.1010683494916672e-05,
"loss": 0.399,
"step": 3794
},
{
"epoch": 1.8760351007292053,
"grad_norm": 0.13317842692197165,
"learning_rate": 1.1006809464401811e-05,
"loss": 0.39,
"step": 3795
},
{
"epoch": 1.8765294771968855,
"grad_norm": 0.10363949383290459,
"learning_rate": 1.1002935281230463e-05,
"loss": 0.3988,
"step": 3796
},
{
"epoch": 1.8770238536645656,
"grad_norm": 0.10342097496168651,
"learning_rate": 1.0999060945990057e-05,
"loss": 0.4393,
"step": 3797
},
{
"epoch": 1.8775182301322457,
"grad_norm": 0.11308695321826685,
"learning_rate": 1.0995186459268028e-05,
"loss": 0.4088,
"step": 3798
},
{
"epoch": 1.8780126065999259,
"grad_norm": 0.10723934967727317,
"learning_rate": 1.0991311821651842e-05,
"loss": 0.4139,
"step": 3799
},
{
"epoch": 1.878506983067606,
"grad_norm": 0.10744078043059271,
"learning_rate": 1.0987437033728991e-05,
"loss": 0.4201,
"step": 3800
},
{
"epoch": 1.8790013595352861,
"grad_norm": 0.1094139625790845,
"learning_rate": 1.0983562096086984e-05,
"loss": 0.3853,
"step": 3801
},
{
"epoch": 1.8794957360029663,
"grad_norm": 0.10778750962406472,
"learning_rate": 1.097968700931335e-05,
"loss": 0.4071,
"step": 3802
},
{
"epoch": 1.8799901124706464,
"grad_norm": 0.10297900006753605,
"learning_rate": 1.097581177399565e-05,
"loss": 0.4052,
"step": 3803
},
{
"epoch": 1.8804844889383265,
"grad_norm": 0.1041268629197392,
"learning_rate": 1.0971936390721465e-05,
"loss": 0.3906,
"step": 3804
},
{
"epoch": 1.8809788654060067,
"grad_norm": 0.1064625611875171,
"learning_rate": 1.096806086007839e-05,
"loss": 0.396,
"step": 3805
},
{
"epoch": 1.8814732418736868,
"grad_norm": 0.10762302172520895,
"learning_rate": 1.0964185182654052e-05,
"loss": 0.4124,
"step": 3806
},
{
"epoch": 1.881967618341367,
"grad_norm": 0.10591661094327949,
"learning_rate": 1.0960309359036096e-05,
"loss": 0.3838,
"step": 3807
},
{
"epoch": 1.882461994809047,
"grad_norm": 0.10463391772373815,
"learning_rate": 1.0956433389812192e-05,
"loss": 0.3918,
"step": 3808
},
{
"epoch": 1.8829563712767272,
"grad_norm": 0.1086175220672744,
"learning_rate": 1.0952557275570026e-05,
"loss": 0.3824,
"step": 3809
},
{
"epoch": 1.8834507477444073,
"grad_norm": 0.1144278516875633,
"learning_rate": 1.0948681016897312e-05,
"loss": 0.4234,
"step": 3810
},
{
"epoch": 1.8839451242120875,
"grad_norm": 0.10750963379137272,
"learning_rate": 1.0944804614381784e-05,
"loss": 0.4089,
"step": 3811
},
{
"epoch": 1.8844395006797676,
"grad_norm": 0.10710097812223436,
"learning_rate": 1.0940928068611199e-05,
"loss": 0.424,
"step": 3812
},
{
"epoch": 1.8849338771474478,
"grad_norm": 0.11474222508595967,
"learning_rate": 1.0937051380173328e-05,
"loss": 0.4106,
"step": 3813
},
{
"epoch": 1.885428253615128,
"grad_norm": 0.11171925998322053,
"learning_rate": 1.0933174549655981e-05,
"loss": 0.4155,
"step": 3814
},
{
"epoch": 1.885922630082808,
"grad_norm": 0.10814263434502805,
"learning_rate": 1.0929297577646967e-05,
"loss": 0.3992,
"step": 3815
},
{
"epoch": 1.8864170065504882,
"grad_norm": 0.1146582654875854,
"learning_rate": 1.0925420464734135e-05,
"loss": 0.3909,
"step": 3816
},
{
"epoch": 1.8869113830181683,
"grad_norm": 0.10887174054362592,
"learning_rate": 1.092154321150535e-05,
"loss": 0.3987,
"step": 3817
},
{
"epoch": 1.8874057594858484,
"grad_norm": 0.10550694042775648,
"learning_rate": 1.0917665818548491e-05,
"loss": 0.4201,
"step": 3818
},
{
"epoch": 1.8879001359535286,
"grad_norm": 0.11980855035949345,
"learning_rate": 1.0913788286451465e-05,
"loss": 0.3861,
"step": 3819
},
{
"epoch": 1.8883945124212087,
"grad_norm": 0.1032510805052488,
"learning_rate": 1.0909910615802207e-05,
"loss": 0.3857,
"step": 3820
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.1158391546748963,
"learning_rate": 1.0906032807188649e-05,
"loss": 0.4146,
"step": 3821
},
{
"epoch": 1.889383265356569,
"grad_norm": 0.11054837851656375,
"learning_rate": 1.0902154861198775e-05,
"loss": 0.3929,
"step": 3822
},
{
"epoch": 1.889877641824249,
"grad_norm": 0.10848478520486698,
"learning_rate": 1.0898276778420566e-05,
"loss": 0.3857,
"step": 3823
},
{
"epoch": 1.8903720182919295,
"grad_norm": 0.10190023502366279,
"learning_rate": 1.0894398559442036e-05,
"loss": 0.4121,
"step": 3824
},
{
"epoch": 1.8908663947596094,
"grad_norm": 0.10755555200087029,
"learning_rate": 1.0890520204851217e-05,
"loss": 0.4012,
"step": 3825
},
{
"epoch": 1.8913607712272897,
"grad_norm": 0.11289610452987453,
"learning_rate": 1.0886641715236158e-05,
"loss": 0.4072,
"step": 3826
},
{
"epoch": 1.8918551476949697,
"grad_norm": 0.10526232237765835,
"learning_rate": 1.0882763091184932e-05,
"loss": 0.4022,
"step": 3827
},
{
"epoch": 1.89234952416265,
"grad_norm": 0.10473206112471616,
"learning_rate": 1.0878884333285631e-05,
"loss": 0.3639,
"step": 3828
},
{
"epoch": 1.89284390063033,
"grad_norm": 0.10630431318164896,
"learning_rate": 1.0875005442126366e-05,
"loss": 0.4088,
"step": 3829
},
{
"epoch": 1.8933382770980103,
"grad_norm": 0.10588764613750841,
"learning_rate": 1.0871126418295277e-05,
"loss": 0.4292,
"step": 3830
},
{
"epoch": 1.8938326535656902,
"grad_norm": 0.10979067712452763,
"learning_rate": 1.0867247262380512e-05,
"loss": 0.4007,
"step": 3831
},
{
"epoch": 1.8943270300333706,
"grad_norm": 0.10846447805298594,
"learning_rate": 1.086336797497024e-05,
"loss": 0.4064,
"step": 3832
},
{
"epoch": 1.8948214065010505,
"grad_norm": 0.11075556889148887,
"learning_rate": 1.0859488556652664e-05,
"loss": 0.4013,
"step": 3833
},
{
"epoch": 1.8953157829687308,
"grad_norm": 0.10471608672061677,
"learning_rate": 1.0855609008015989e-05,
"loss": 0.3921,
"step": 3834
},
{
"epoch": 1.8958101594364107,
"grad_norm": 0.10545552166632541,
"learning_rate": 1.0851729329648451e-05,
"loss": 0.419,
"step": 3835
},
{
"epoch": 1.896304535904091,
"grad_norm": 0.10878859548945875,
"learning_rate": 1.08478495221383e-05,
"loss": 0.4072,
"step": 3836
},
{
"epoch": 1.896798912371771,
"grad_norm": 0.10522018952521801,
"learning_rate": 1.0843969586073812e-05,
"loss": 0.405,
"step": 3837
},
{
"epoch": 1.8972932888394514,
"grad_norm": 0.10564555458027848,
"learning_rate": 1.0840089522043278e-05,
"loss": 0.434,
"step": 3838
},
{
"epoch": 1.8977876653071313,
"grad_norm": 0.31765654069627675,
"learning_rate": 1.0836209330635004e-05,
"loss": 0.398,
"step": 3839
},
{
"epoch": 1.8982820417748116,
"grad_norm": 0.11407201068438616,
"learning_rate": 1.0832329012437324e-05,
"loss": 0.4121,
"step": 3840
},
{
"epoch": 1.8987764182424915,
"grad_norm": 0.10551062957634381,
"learning_rate": 1.0828448568038588e-05,
"loss": 0.4064,
"step": 3841
},
{
"epoch": 1.899270794710172,
"grad_norm": 0.10954203336940158,
"learning_rate": 1.0824567998027162e-05,
"loss": 0.3948,
"step": 3842
},
{
"epoch": 1.8997651711778518,
"grad_norm": 0.11091056883601011,
"learning_rate": 1.0820687302991435e-05,
"loss": 0.396,
"step": 3843
},
{
"epoch": 1.9002595476455322,
"grad_norm": 0.10387050868684537,
"learning_rate": 1.081680648351981e-05,
"loss": 0.4008,
"step": 3844
},
{
"epoch": 1.900753924113212,
"grad_norm": 0.12822285668936959,
"learning_rate": 1.0812925540200718e-05,
"loss": 0.4192,
"step": 3845
},
{
"epoch": 1.9012483005808924,
"grad_norm": 0.1134623634686496,
"learning_rate": 1.0809044473622602e-05,
"loss": 0.4059,
"step": 3846
},
{
"epoch": 1.9017426770485724,
"grad_norm": 0.10974314380634043,
"learning_rate": 1.0805163284373922e-05,
"loss": 0.4369,
"step": 3847
},
{
"epoch": 1.9022370535162527,
"grad_norm": 0.11082339691095333,
"learning_rate": 1.0801281973043161e-05,
"loss": 0.4055,
"step": 3848
},
{
"epoch": 1.9027314299839326,
"grad_norm": 0.1123230809198496,
"learning_rate": 1.0797400540218819e-05,
"loss": 0.4224,
"step": 3849
},
{
"epoch": 1.903225806451613,
"grad_norm": 0.11339899222427235,
"learning_rate": 1.0793518986489417e-05,
"loss": 0.4152,
"step": 3850
},
{
"epoch": 1.903720182919293,
"grad_norm": 0.12741100103209183,
"learning_rate": 1.0789637312443483e-05,
"loss": 0.4163,
"step": 3851
},
{
"epoch": 1.9042145593869733,
"grad_norm": 0.1112719685841783,
"learning_rate": 1.0785755518669583e-05,
"loss": 0.3874,
"step": 3852
},
{
"epoch": 1.9047089358546532,
"grad_norm": 0.11203089586823516,
"learning_rate": 1.0781873605756289e-05,
"loss": 0.4042,
"step": 3853
},
{
"epoch": 1.9052033123223335,
"grad_norm": 0.10983977844316235,
"learning_rate": 1.077799157429218e-05,
"loss": 0.3943,
"step": 3854
},
{
"epoch": 1.9056976887900134,
"grad_norm": 0.10813564800447094,
"learning_rate": 1.0774109424865875e-05,
"loss": 0.3973,
"step": 3855
},
{
"epoch": 1.9061920652576938,
"grad_norm": 0.1051074921852994,
"learning_rate": 1.0770227158066002e-05,
"loss": 0.3918,
"step": 3856
},
{
"epoch": 1.9066864417253737,
"grad_norm": 0.10942166204906992,
"learning_rate": 1.0766344774481203e-05,
"loss": 0.3998,
"step": 3857
},
{
"epoch": 1.907180818193054,
"grad_norm": 0.10883431083378389,
"learning_rate": 1.076246227470014e-05,
"loss": 0.4062,
"step": 3858
},
{
"epoch": 1.907675194660734,
"grad_norm": 0.10698857966678911,
"learning_rate": 1.0758579659311496e-05,
"loss": 0.4078,
"step": 3859
},
{
"epoch": 1.9081695711284143,
"grad_norm": 0.10899143880805684,
"learning_rate": 1.0754696928903965e-05,
"loss": 0.4182,
"step": 3860
},
{
"epoch": 1.9086639475960945,
"grad_norm": 0.11177718702622223,
"learning_rate": 1.0750814084066265e-05,
"loss": 0.4189,
"step": 3861
},
{
"epoch": 1.9091583240637746,
"grad_norm": 0.11397439982460808,
"learning_rate": 1.0746931125387128e-05,
"loss": 0.4141,
"step": 3862
},
{
"epoch": 1.9096527005314547,
"grad_norm": 0.1113841825567561,
"learning_rate": 1.07430480534553e-05,
"loss": 0.4001,
"step": 3863
},
{
"epoch": 1.9101470769991349,
"grad_norm": 0.10953868912459698,
"learning_rate": 1.0739164868859555e-05,
"loss": 0.3979,
"step": 3864
},
{
"epoch": 1.910641453466815,
"grad_norm": 0.10243914356420745,
"learning_rate": 1.0735281572188667e-05,
"loss": 0.3695,
"step": 3865
},
{
"epoch": 1.9111358299344952,
"grad_norm": 0.12146629077940715,
"learning_rate": 1.073139816403145e-05,
"loss": 0.4026,
"step": 3866
},
{
"epoch": 1.9116302064021753,
"grad_norm": 0.10900375058378868,
"learning_rate": 1.072751464497671e-05,
"loss": 0.3856,
"step": 3867
},
{
"epoch": 1.9121245828698554,
"grad_norm": 0.10974381574983805,
"learning_rate": 1.0723631015613289e-05,
"loss": 0.3807,
"step": 3868
},
{
"epoch": 1.9126189593375356,
"grad_norm": 0.11116834327536225,
"learning_rate": 1.0719747276530037e-05,
"loss": 0.4171,
"step": 3869
},
{
"epoch": 1.9131133358052157,
"grad_norm": 0.10817158235049656,
"learning_rate": 1.0715863428315819e-05,
"loss": 0.3916,
"step": 3870
},
{
"epoch": 1.9136077122728958,
"grad_norm": 0.11276609025302718,
"learning_rate": 1.0711979471559521e-05,
"loss": 0.3737,
"step": 3871
},
{
"epoch": 1.914102088740576,
"grad_norm": 0.10865092718245542,
"learning_rate": 1.0708095406850048e-05,
"loss": 0.4191,
"step": 3872
},
{
"epoch": 1.914596465208256,
"grad_norm": 0.10898812141936884,
"learning_rate": 1.0704211234776311e-05,
"loss": 0.4129,
"step": 3873
},
{
"epoch": 1.9150908416759362,
"grad_norm": 0.11042422489155362,
"learning_rate": 1.070032695592725e-05,
"loss": 0.4377,
"step": 3874
},
{
"epoch": 1.9155852181436164,
"grad_norm": 0.1124237029049148,
"learning_rate": 1.069644257089181e-05,
"loss": 0.4203,
"step": 3875
},
{
"epoch": 1.9160795946112965,
"grad_norm": 0.1117511319870787,
"learning_rate": 1.0692558080258959e-05,
"loss": 0.4104,
"step": 3876
},
{
"epoch": 1.9165739710789766,
"grad_norm": 0.11002347637207396,
"learning_rate": 1.0688673484617679e-05,
"loss": 0.3944,
"step": 3877
},
{
"epoch": 1.9170683475466568,
"grad_norm": 0.11404278549580947,
"learning_rate": 1.0684788784556965e-05,
"loss": 0.4002,
"step": 3878
},
{
"epoch": 1.917562724014337,
"grad_norm": 0.10534727912618996,
"learning_rate": 1.068090398066584e-05,
"loss": 0.4113,
"step": 3879
},
{
"epoch": 1.918057100482017,
"grad_norm": 0.1072891907577018,
"learning_rate": 1.0677019073533324e-05,
"loss": 0.3986,
"step": 3880
},
{
"epoch": 1.9185514769496972,
"grad_norm": 0.11895272820142998,
"learning_rate": 1.0673134063748463e-05,
"loss": 0.4028,
"step": 3881
},
{
"epoch": 1.9190458534173773,
"grad_norm": 0.10895984883490199,
"learning_rate": 1.0669248951900326e-05,
"loss": 0.404,
"step": 3882
},
{
"epoch": 1.9195402298850575,
"grad_norm": 0.11495140477077372,
"learning_rate": 1.0665363738577978e-05,
"loss": 0.4063,
"step": 3883
},
{
"epoch": 1.9200346063527376,
"grad_norm": 0.11259979263975624,
"learning_rate": 1.0661478424370518e-05,
"loss": 0.3776,
"step": 3884
},
{
"epoch": 1.9205289828204177,
"grad_norm": 0.10408693591022089,
"learning_rate": 1.0657593009867048e-05,
"loss": 0.4051,
"step": 3885
},
{
"epoch": 1.9210233592880979,
"grad_norm": 0.10615462249197374,
"learning_rate": 1.0653707495656696e-05,
"loss": 0.3922,
"step": 3886
},
{
"epoch": 1.921517735755778,
"grad_norm": 0.11488258296579884,
"learning_rate": 1.0649821882328595e-05,
"loss": 0.427,
"step": 3887
},
{
"epoch": 1.9220121122234581,
"grad_norm": 0.11000967069320551,
"learning_rate": 1.06459361704719e-05,
"loss": 0.3912,
"step": 3888
},
{
"epoch": 1.9225064886911383,
"grad_norm": 0.10997165735956228,
"learning_rate": 1.0642050360675776e-05,
"loss": 0.4198,
"step": 3889
},
{
"epoch": 1.9230008651588184,
"grad_norm": 0.30901399623067044,
"learning_rate": 1.0638164453529403e-05,
"loss": 0.4119,
"step": 3890
},
{
"epoch": 1.9234952416264985,
"grad_norm": 0.10613900733151432,
"learning_rate": 1.0634278449621982e-05,
"loss": 0.4186,
"step": 3891
},
{
"epoch": 1.9239896180941787,
"grad_norm": 0.11295367084937033,
"learning_rate": 1.0630392349542724e-05,
"loss": 0.4063,
"step": 3892
},
{
"epoch": 1.9244839945618588,
"grad_norm": 0.11099858315288083,
"learning_rate": 1.0626506153880854e-05,
"loss": 0.3705,
"step": 3893
},
{
"epoch": 1.924978371029539,
"grad_norm": 0.10181025147999512,
"learning_rate": 1.062261986322561e-05,
"loss": 0.417,
"step": 3894
},
{
"epoch": 1.925472747497219,
"grad_norm": 0.11359012635645838,
"learning_rate": 1.0618733478166252e-05,
"loss": 0.3896,
"step": 3895
},
{
"epoch": 1.9259671239648992,
"grad_norm": 0.11438975847670109,
"learning_rate": 1.0614846999292045e-05,
"loss": 0.4008,
"step": 3896
},
{
"epoch": 1.9264615004325794,
"grad_norm": 0.10589557475916009,
"learning_rate": 1.0610960427192273e-05,
"loss": 0.3803,
"step": 3897
},
{
"epoch": 1.9269558769002595,
"grad_norm": 0.10334843046228466,
"learning_rate": 1.0607073762456236e-05,
"loss": 0.4185,
"step": 3898
},
{
"epoch": 1.9274502533679398,
"grad_norm": 0.10847587850216407,
"learning_rate": 1.0603187005673247e-05,
"loss": 0.3808,
"step": 3899
},
{
"epoch": 1.9279446298356198,
"grad_norm": 0.10790570191793458,
"learning_rate": 1.0599300157432625e-05,
"loss": 0.407,
"step": 3900
},
{
"epoch": 1.9284390063033001,
"grad_norm": 0.11261012987333682,
"learning_rate": 1.0595413218323715e-05,
"loss": 0.4577,
"step": 3901
},
{
"epoch": 1.92893338277098,
"grad_norm": 0.11221988432128124,
"learning_rate": 1.0591526188935872e-05,
"loss": 0.4147,
"step": 3902
},
{
"epoch": 1.9294277592386604,
"grad_norm": 0.10842107060756034,
"learning_rate": 1.0587639069858458e-05,
"loss": 0.3757,
"step": 3903
},
{
"epoch": 1.9299221357063403,
"grad_norm": 0.10661558867324389,
"learning_rate": 1.0583751861680854e-05,
"loss": 0.4087,
"step": 3904
},
{
"epoch": 1.9304165121740207,
"grad_norm": 0.1076229341981053,
"learning_rate": 1.0579864564992455e-05,
"loss": 0.4204,
"step": 3905
},
{
"epoch": 1.9309108886417006,
"grad_norm": 0.5579352967908765,
"learning_rate": 1.057597718038267e-05,
"loss": 0.4715,
"step": 3906
},
{
"epoch": 1.931405265109381,
"grad_norm": 0.11269454305006818,
"learning_rate": 1.057208970844092e-05,
"loss": 0.3867,
"step": 3907
},
{
"epoch": 1.9318996415770608,
"grad_norm": 0.10647575984953593,
"learning_rate": 1.0568202149756637e-05,
"loss": 0.3988,
"step": 3908
},
{
"epoch": 1.9323940180447412,
"grad_norm": 0.11251248871244157,
"learning_rate": 1.0564314504919269e-05,
"loss": 0.4141,
"step": 3909
},
{
"epoch": 1.9328883945124211,
"grad_norm": 0.11762761514834852,
"learning_rate": 1.0560426774518275e-05,
"loss": 0.3999,
"step": 3910
},
{
"epoch": 1.9333827709801015,
"grad_norm": 0.12415703886856068,
"learning_rate": 1.0556538959143128e-05,
"loss": 0.4124,
"step": 3911
},
{
"epoch": 1.9338771474477814,
"grad_norm": 0.10193330617308884,
"learning_rate": 1.055265105938332e-05,
"loss": 0.3977,
"step": 3912
},
{
"epoch": 1.9343715239154617,
"grad_norm": 0.11169295645280798,
"learning_rate": 1.0548763075828346e-05,
"loss": 0.3939,
"step": 3913
},
{
"epoch": 1.9348659003831417,
"grad_norm": 0.19426415091130025,
"learning_rate": 1.0544875009067713e-05,
"loss": 0.4607,
"step": 3914
},
{
"epoch": 1.935360276850822,
"grad_norm": 0.11177601447513956,
"learning_rate": 1.0540986859690953e-05,
"loss": 0.4217,
"step": 3915
},
{
"epoch": 1.935854653318502,
"grad_norm": 0.10482106274706457,
"learning_rate": 1.0537098628287596e-05,
"loss": 0.4149,
"step": 3916
},
{
"epoch": 1.9363490297861823,
"grad_norm": 0.1103491295320594,
"learning_rate": 1.0533210315447196e-05,
"loss": 0.4264,
"step": 3917
},
{
"epoch": 1.9368434062538622,
"grad_norm": 0.11566439128028232,
"learning_rate": 1.0529321921759312e-05,
"loss": 0.3828,
"step": 3918
},
{
"epoch": 1.9373377827215426,
"grad_norm": 0.10622840813435865,
"learning_rate": 1.0525433447813522e-05,
"loss": 0.3714,
"step": 3919
},
{
"epoch": 1.9378321591892225,
"grad_norm": 0.10388296235649852,
"learning_rate": 1.0521544894199407e-05,
"loss": 0.3793,
"step": 3920
},
{
"epoch": 1.9383265356569028,
"grad_norm": 0.10781297820717883,
"learning_rate": 1.0517656261506566e-05,
"loss": 0.4059,
"step": 3921
},
{
"epoch": 1.9388209121245827,
"grad_norm": 0.10466072045637749,
"learning_rate": 1.0513767550324611e-05,
"loss": 0.4103,
"step": 3922
},
{
"epoch": 1.939315288592263,
"grad_norm": 0.10816536457719722,
"learning_rate": 1.0509878761243164e-05,
"loss": 0.4154,
"step": 3923
},
{
"epoch": 1.939809665059943,
"grad_norm": 0.10658032669504117,
"learning_rate": 1.0505989894851856e-05,
"loss": 0.3969,
"step": 3924
},
{
"epoch": 1.9403040415276234,
"grad_norm": 0.1166064202409093,
"learning_rate": 1.0502100951740335e-05,
"loss": 0.3963,
"step": 3925
},
{
"epoch": 1.9407984179953033,
"grad_norm": 0.10592267507936017,
"learning_rate": 1.0498211932498257e-05,
"loss": 0.3773,
"step": 3926
},
{
"epoch": 1.9412927944629836,
"grad_norm": 0.11190066336746038,
"learning_rate": 1.0494322837715289e-05,
"loss": 0.4003,
"step": 3927
},
{
"epoch": 1.9417871709306636,
"grad_norm": 0.10913787046731502,
"learning_rate": 1.0490433667981116e-05,
"loss": 0.4326,
"step": 3928
},
{
"epoch": 1.942281547398344,
"grad_norm": 0.5109771615394056,
"learning_rate": 1.0486544423885428e-05,
"loss": 0.4151,
"step": 3929
},
{
"epoch": 1.9427759238660238,
"grad_norm": 0.11516779562829245,
"learning_rate": 1.0482655106017922e-05,
"loss": 0.4006,
"step": 3930
},
{
"epoch": 1.9432703003337042,
"grad_norm": 0.10616956200112959,
"learning_rate": 1.0478765714968318e-05,
"loss": 0.3743,
"step": 3931
},
{
"epoch": 1.943764676801384,
"grad_norm": 0.10795429216465177,
"learning_rate": 1.0474876251326342e-05,
"loss": 0.3815,
"step": 3932
},
{
"epoch": 1.9442590532690645,
"grad_norm": 0.1132967234984529,
"learning_rate": 1.0470986715681724e-05,
"loss": 0.4094,
"step": 3933
},
{
"epoch": 1.9447534297367444,
"grad_norm": 0.1145728195175198,
"learning_rate": 1.0467097108624215e-05,
"loss": 0.3931,
"step": 3934
},
{
"epoch": 1.9452478062044247,
"grad_norm": 0.11182905164592077,
"learning_rate": 1.0463207430743576e-05,
"loss": 0.3939,
"step": 3935
},
{
"epoch": 1.9457421826721049,
"grad_norm": 0.11234739475263976,
"learning_rate": 1.0459317682629566e-05,
"loss": 0.3977,
"step": 3936
},
{
"epoch": 1.946236559139785,
"grad_norm": 0.1097564684168457,
"learning_rate": 1.0455427864871971e-05,
"loss": 0.3755,
"step": 3937
},
{
"epoch": 1.9467309356074651,
"grad_norm": 0.1155411331778849,
"learning_rate": 1.0451537978060582e-05,
"loss": 0.4122,
"step": 3938
},
{
"epoch": 1.9472253120751453,
"grad_norm": 0.1064199157158972,
"learning_rate": 1.0447648022785197e-05,
"loss": 0.4075,
"step": 3939
},
{
"epoch": 1.9477196885428254,
"grad_norm": 0.1153535751298826,
"learning_rate": 1.0443757999635625e-05,
"loss": 0.4137,
"step": 3940
},
{
"epoch": 1.9482140650105055,
"grad_norm": 0.11921642741589102,
"learning_rate": 1.0439867909201689e-05,
"loss": 0.4035,
"step": 3941
},
{
"epoch": 1.9487084414781857,
"grad_norm": 0.11246840585057771,
"learning_rate": 1.0435977752073219e-05,
"loss": 0.3922,
"step": 3942
},
{
"epoch": 1.9492028179458658,
"grad_norm": 0.10425928370817919,
"learning_rate": 1.0432087528840056e-05,
"loss": 0.3814,
"step": 3943
},
{
"epoch": 1.949697194413546,
"grad_norm": 0.10404054143514244,
"learning_rate": 1.0428197240092053e-05,
"loss": 0.4005,
"step": 3944
},
{
"epoch": 1.950191570881226,
"grad_norm": 0.11051609798867781,
"learning_rate": 1.0424306886419069e-05,
"loss": 0.3845,
"step": 3945
},
{
"epoch": 1.9506859473489062,
"grad_norm": 0.11158046401096307,
"learning_rate": 1.0420416468410976e-05,
"loss": 0.4171,
"step": 3946
},
{
"epoch": 1.9511803238165863,
"grad_norm": 0.11171958819507066,
"learning_rate": 1.0416525986657654e-05,
"loss": 0.3853,
"step": 3947
},
{
"epoch": 1.9516747002842665,
"grad_norm": 0.10324369262721006,
"learning_rate": 1.0412635441748997e-05,
"loss": 0.4179,
"step": 3948
},
{
"epoch": 1.9521690767519466,
"grad_norm": 0.12319021958138587,
"learning_rate": 1.04087448342749e-05,
"loss": 0.3982,
"step": 3949
},
{
"epoch": 1.9526634532196268,
"grad_norm": 0.10833081886899032,
"learning_rate": 1.0404854164825275e-05,
"loss": 0.3993,
"step": 3950
},
{
"epoch": 1.953157829687307,
"grad_norm": 0.11627747696690191,
"learning_rate": 1.0400963433990044e-05,
"loss": 0.4325,
"step": 3951
},
{
"epoch": 1.953652206154987,
"grad_norm": 0.11470118467139472,
"learning_rate": 1.0397072642359125e-05,
"loss": 0.4111,
"step": 3952
},
{
"epoch": 1.9541465826226672,
"grad_norm": 0.10577520169571816,
"learning_rate": 1.0393181790522467e-05,
"loss": 0.4029,
"step": 3953
},
{
"epoch": 1.9546409590903473,
"grad_norm": 0.10596784789822539,
"learning_rate": 1.0389290879070008e-05,
"loss": 0.3892,
"step": 3954
},
{
"epoch": 1.9551353355580274,
"grad_norm": 0.1153886894834114,
"learning_rate": 1.0385399908591712e-05,
"loss": 0.4148,
"step": 3955
},
{
"epoch": 1.9556297120257076,
"grad_norm": 0.10886109207079453,
"learning_rate": 1.0381508879677535e-05,
"loss": 0.4209,
"step": 3956
},
{
"epoch": 1.9561240884933877,
"grad_norm": 0.11370363929850384,
"learning_rate": 1.0377617792917456e-05,
"loss": 0.4242,
"step": 3957
},
{
"epoch": 1.9566184649610678,
"grad_norm": 0.10710345291235357,
"learning_rate": 1.0373726648901454e-05,
"loss": 0.3831,
"step": 3958
},
{
"epoch": 1.957112841428748,
"grad_norm": 0.11210683830590112,
"learning_rate": 1.0369835448219521e-05,
"loss": 0.4164,
"step": 3959
},
{
"epoch": 1.957607217896428,
"grad_norm": 0.17512607072330683,
"learning_rate": 1.0365944191461656e-05,
"loss": 0.3975,
"step": 3960
},
{
"epoch": 1.9581015943641082,
"grad_norm": 0.15141802138272406,
"learning_rate": 1.036205287921787e-05,
"loss": 0.3989,
"step": 3961
},
{
"epoch": 1.9585959708317884,
"grad_norm": 0.10476547388702767,
"learning_rate": 1.0358161512078178e-05,
"loss": 0.3733,
"step": 3962
},
{
"epoch": 1.9590903472994685,
"grad_norm": 0.10775337143174651,
"learning_rate": 1.0354270090632596e-05,
"loss": 0.4091,
"step": 3963
},
{
"epoch": 1.9595847237671487,
"grad_norm": 0.10473845144115929,
"learning_rate": 1.0350378615471173e-05,
"loss": 0.4273,
"step": 3964
},
{
"epoch": 1.9600791002348288,
"grad_norm": 0.10939728124243991,
"learning_rate": 1.0346487087183939e-05,
"loss": 0.3764,
"step": 3965
},
{
"epoch": 1.960573476702509,
"grad_norm": 0.09910073841526816,
"learning_rate": 1.0342595506360942e-05,
"loss": 0.4386,
"step": 3966
},
{
"epoch": 1.961067853170189,
"grad_norm": 0.11256561450828693,
"learning_rate": 1.0338703873592244e-05,
"loss": 0.4129,
"step": 3967
},
{
"epoch": 1.9615622296378692,
"grad_norm": 0.1071985355545336,
"learning_rate": 1.0334812189467912e-05,
"loss": 0.4005,
"step": 3968
},
{
"epoch": 1.9620566061055493,
"grad_norm": 0.10144656773650414,
"learning_rate": 1.0330920454578011e-05,
"loss": 0.4066,
"step": 3969
},
{
"epoch": 1.9625509825732295,
"grad_norm": 0.1039084408470797,
"learning_rate": 1.0327028669512629e-05,
"loss": 0.4195,
"step": 3970
},
{
"epoch": 1.9630453590409096,
"grad_norm": 0.1065236616283246,
"learning_rate": 1.0323136834861849e-05,
"loss": 0.3826,
"step": 3971
},
{
"epoch": 1.96353973550859,
"grad_norm": 0.10607138335682342,
"learning_rate": 1.0319244951215768e-05,
"loss": 0.4023,
"step": 3972
},
{
"epoch": 1.9640341119762699,
"grad_norm": 0.10528077111559749,
"learning_rate": 1.0315353019164489e-05,
"loss": 0.3889,
"step": 3973
},
{
"epoch": 1.9645284884439502,
"grad_norm": 0.09847487652798843,
"learning_rate": 1.0311461039298125e-05,
"loss": 0.3797,
"step": 3974
},
{
"epoch": 1.9650228649116301,
"grad_norm": 0.10665933450584324,
"learning_rate": 1.0307569012206788e-05,
"loss": 0.4004,
"step": 3975
},
{
"epoch": 1.9655172413793105,
"grad_norm": 0.10456349666235658,
"learning_rate": 1.0303676938480608e-05,
"loss": 0.3924,
"step": 3976
},
{
"epoch": 1.9660116178469904,
"grad_norm": 0.10990586088474608,
"learning_rate": 1.0299784818709714e-05,
"loss": 0.4202,
"step": 3977
},
{
"epoch": 1.9665059943146708,
"grad_norm": 0.10463262324242259,
"learning_rate": 1.0295892653484247e-05,
"loss": 0.4116,
"step": 3978
},
{
"epoch": 1.9670003707823507,
"grad_norm": 0.10618756897457407,
"learning_rate": 1.0292000443394347e-05,
"loss": 0.3901,
"step": 3979
},
{
"epoch": 1.967494747250031,
"grad_norm": 0.10642267337501943,
"learning_rate": 1.0288108189030175e-05,
"loss": 0.396,
"step": 3980
},
{
"epoch": 1.967989123717711,
"grad_norm": 0.10891308179853434,
"learning_rate": 1.0284215890981885e-05,
"loss": 0.3865,
"step": 3981
},
{
"epoch": 1.9684835001853913,
"grad_norm": 0.10685158028906201,
"learning_rate": 1.0280323549839642e-05,
"loss": 0.4052,
"step": 3982
},
{
"epoch": 1.9689778766530712,
"grad_norm": 0.10284859686598084,
"learning_rate": 1.0276431166193621e-05,
"loss": 0.4063,
"step": 3983
},
{
"epoch": 1.9694722531207516,
"grad_norm": 0.1061044264564093,
"learning_rate": 1.0272538740634002e-05,
"loss": 0.4031,
"step": 3984
},
{
"epoch": 1.9699666295884315,
"grad_norm": 0.10749230792917361,
"learning_rate": 1.0268646273750961e-05,
"loss": 0.3852,
"step": 3985
},
{
"epoch": 1.9704610060561119,
"grad_norm": 0.10484698732034438,
"learning_rate": 1.0264753766134703e-05,
"loss": 0.4123,
"step": 3986
},
{
"epoch": 1.9709553825237918,
"grad_norm": 0.10875477596168281,
"learning_rate": 1.0260861218375416e-05,
"loss": 0.3826,
"step": 3987
},
{
"epoch": 1.9714497589914721,
"grad_norm": 0.10728586449397624,
"learning_rate": 1.025696863106331e-05,
"loss": 0.4033,
"step": 3988
},
{
"epoch": 1.971944135459152,
"grad_norm": 0.1095406938552965,
"learning_rate": 1.0253076004788587e-05,
"loss": 0.3648,
"step": 3989
},
{
"epoch": 1.9724385119268324,
"grad_norm": 0.11105652768714047,
"learning_rate": 1.0249183340141469e-05,
"loss": 0.4018,
"step": 3990
},
{
"epoch": 1.9729328883945123,
"grad_norm": 0.11366364231646771,
"learning_rate": 1.0245290637712172e-05,
"loss": 0.4066,
"step": 3991
},
{
"epoch": 1.9734272648621927,
"grad_norm": 0.1062457807456308,
"learning_rate": 1.0241397898090933e-05,
"loss": 0.4053,
"step": 3992
},
{
"epoch": 1.9739216413298726,
"grad_norm": 0.11002812847417996,
"learning_rate": 1.023750512186797e-05,
"loss": 0.4015,
"step": 3993
},
{
"epoch": 1.974416017797553,
"grad_norm": 0.10920110570887116,
"learning_rate": 1.0233612309633537e-05,
"loss": 0.4156,
"step": 3994
},
{
"epoch": 1.9749103942652328,
"grad_norm": 0.10556289594151995,
"learning_rate": 1.0229719461977868e-05,
"loss": 0.4033,
"step": 3995
},
{
"epoch": 1.9754047707329132,
"grad_norm": 0.11296089481083674,
"learning_rate": 1.022582657949121e-05,
"loss": 0.3836,
"step": 3996
},
{
"epoch": 1.9758991472005931,
"grad_norm": 0.10509857852085516,
"learning_rate": 1.0221933662763828e-05,
"loss": 0.4023,
"step": 3997
},
{
"epoch": 1.9763935236682735,
"grad_norm": 0.103774575627117,
"learning_rate": 1.021804071238597e-05,
"loss": 0.3872,
"step": 3998
},
{
"epoch": 1.9768879001359534,
"grad_norm": 0.15794618898700208,
"learning_rate": 1.021414772894791e-05,
"loss": 0.4241,
"step": 3999
},
{
"epoch": 1.9773822766036337,
"grad_norm": 0.10999444054130886,
"learning_rate": 1.0210254713039913e-05,
"loss": 0.4347,
"step": 4000
},
{
"epoch": 1.9778766530713137,
"grad_norm": 0.18940473800812102,
"learning_rate": 1.0206361665252253e-05,
"loss": 0.4356,
"step": 4001
},
{
"epoch": 1.978371029538994,
"grad_norm": 0.1241091146308898,
"learning_rate": 1.0202468586175214e-05,
"loss": 0.4081,
"step": 4002
},
{
"epoch": 1.978865406006674,
"grad_norm": 0.10654911270647652,
"learning_rate": 1.0198575476399076e-05,
"loss": 0.394,
"step": 4003
},
{
"epoch": 1.9793597824743543,
"grad_norm": 0.10847875352499506,
"learning_rate": 1.0194682336514128e-05,
"loss": 0.402,
"step": 4004
},
{
"epoch": 1.9798541589420342,
"grad_norm": 0.10861261100554666,
"learning_rate": 1.0190789167110667e-05,
"loss": 0.4082,
"step": 4005
},
{
"epoch": 1.9803485354097146,
"grad_norm": 0.10599650047061653,
"learning_rate": 1.0186895968778987e-05,
"loss": 0.3747,
"step": 4006
},
{
"epoch": 1.9808429118773945,
"grad_norm": 0.10790891494503646,
"learning_rate": 1.0183002742109392e-05,
"loss": 0.4365,
"step": 4007
},
{
"epoch": 1.9813372883450748,
"grad_norm": 0.11454570397069111,
"learning_rate": 1.0179109487692188e-05,
"loss": 0.4151,
"step": 4008
},
{
"epoch": 1.981831664812755,
"grad_norm": 0.10844746480364768,
"learning_rate": 1.0175216206117684e-05,
"loss": 0.4055,
"step": 4009
},
{
"epoch": 1.982326041280435,
"grad_norm": 0.1050441748243041,
"learning_rate": 1.0171322897976203e-05,
"loss": 0.3909,
"step": 4010
},
{
"epoch": 1.9828204177481152,
"grad_norm": 0.10564747896014257,
"learning_rate": 1.0167429563858055e-05,
"loss": 0.4142,
"step": 4011
},
{
"epoch": 1.9833147942157954,
"grad_norm": 0.10564215742484731,
"learning_rate": 1.0163536204353565e-05,
"loss": 0.4001,
"step": 4012
},
{
"epoch": 1.9838091706834755,
"grad_norm": 0.10658484031045166,
"learning_rate": 1.0159642820053062e-05,
"loss": 0.4003,
"step": 4013
},
{
"epoch": 1.9843035471511556,
"grad_norm": 0.10452128139510584,
"learning_rate": 1.0155749411546877e-05,
"loss": 0.4069,
"step": 4014
},
{
"epoch": 1.9847979236188358,
"grad_norm": 0.10730418579794093,
"learning_rate": 1.015185597942534e-05,
"loss": 0.4188,
"step": 4015
},
{
"epoch": 1.985292300086516,
"grad_norm": 0.11355084167543626,
"learning_rate": 1.0147962524278794e-05,
"loss": 0.3974,
"step": 4016
},
{
"epoch": 1.985786676554196,
"grad_norm": 0.1062329872769411,
"learning_rate": 1.014406904669758e-05,
"loss": 0.4023,
"step": 4017
},
{
"epoch": 1.9862810530218762,
"grad_norm": 0.10430340928654899,
"learning_rate": 1.0140175547272033e-05,
"loss": 0.4196,
"step": 4018
},
{
"epoch": 1.9867754294895563,
"grad_norm": 0.10566504095843716,
"learning_rate": 1.0136282026592512e-05,
"loss": 0.3953,
"step": 4019
},
{
"epoch": 1.9872698059572365,
"grad_norm": 0.10482591996411461,
"learning_rate": 1.0132388485249365e-05,
"loss": 0.3915,
"step": 4020
},
{
"epoch": 1.9877641824249166,
"grad_norm": 0.10603539527678876,
"learning_rate": 1.0128494923832945e-05,
"loss": 0.4146,
"step": 4021
},
{
"epoch": 1.9882585588925967,
"grad_norm": 0.12038942602885827,
"learning_rate": 1.012460134293361e-05,
"loss": 0.4086,
"step": 4022
},
{
"epoch": 1.9887529353602769,
"grad_norm": 0.10863524096828853,
"learning_rate": 1.0120707743141722e-05,
"loss": 0.3846,
"step": 4023
},
{
"epoch": 1.989247311827957,
"grad_norm": 0.10342371889518548,
"learning_rate": 1.0116814125047643e-05,
"loss": 0.404,
"step": 4024
},
{
"epoch": 1.9897416882956371,
"grad_norm": 0.10235617528817573,
"learning_rate": 1.0112920489241738e-05,
"loss": 0.3996,
"step": 4025
},
{
"epoch": 1.9902360647633173,
"grad_norm": 0.10815727113833463,
"learning_rate": 1.0109026836314376e-05,
"loss": 0.4088,
"step": 4026
},
{
"epoch": 1.9907304412309974,
"grad_norm": 0.2501965703185539,
"learning_rate": 1.0105133166855927e-05,
"loss": 0.4383,
"step": 4027
},
{
"epoch": 1.9912248176986775,
"grad_norm": 0.10397739389963716,
"learning_rate": 1.0101239481456769e-05,
"loss": 0.3868,
"step": 4028
},
{
"epoch": 1.9917191941663577,
"grad_norm": 0.10361568894024217,
"learning_rate": 1.0097345780707271e-05,
"loss": 0.4152,
"step": 4029
},
{
"epoch": 1.9922135706340378,
"grad_norm": 0.10966483536474114,
"learning_rate": 1.009345206519782e-05,
"loss": 0.4497,
"step": 4030
},
{
"epoch": 1.992707947101718,
"grad_norm": 0.11373726039863183,
"learning_rate": 1.0089558335518789e-05,
"loss": 0.4088,
"step": 4031
},
{
"epoch": 1.993202323569398,
"grad_norm": 0.10668039476623498,
"learning_rate": 1.0085664592260569e-05,
"loss": 0.418,
"step": 4032
},
{
"epoch": 1.9936967000370782,
"grad_norm": 0.10424926453866894,
"learning_rate": 1.008177083601354e-05,
"loss": 0.3768,
"step": 4033
},
{
"epoch": 1.9941910765047584,
"grad_norm": 0.10200654922981883,
"learning_rate": 1.0077877067368087e-05,
"loss": 0.3792,
"step": 4034
},
{
"epoch": 1.9946854529724385,
"grad_norm": 0.10309879155025356,
"learning_rate": 1.0073983286914602e-05,
"loss": 0.3744,
"step": 4035
},
{
"epoch": 1.9951798294401186,
"grad_norm": 0.10626307790558484,
"learning_rate": 1.0070089495243476e-05,
"loss": 0.399,
"step": 4036
},
{
"epoch": 1.9956742059077988,
"grad_norm": 0.10946015975641386,
"learning_rate": 1.0066195692945098e-05,
"loss": 0.3772,
"step": 4037
},
{
"epoch": 1.996168582375479,
"grad_norm": 0.10197516845389532,
"learning_rate": 1.0062301880609867e-05,
"loss": 0.3923,
"step": 4038
},
{
"epoch": 1.996662958843159,
"grad_norm": 0.1063795614727839,
"learning_rate": 1.0058408058828173e-05,
"loss": 0.4151,
"step": 4039
},
{
"epoch": 1.9971573353108392,
"grad_norm": 0.10710697162793299,
"learning_rate": 1.0054514228190415e-05,
"loss": 0.3983,
"step": 4040
},
{
"epoch": 1.9976517117785193,
"grad_norm": 0.10409597262648232,
"learning_rate": 1.0050620389286994e-05,
"loss": 0.3893,
"step": 4041
},
{
"epoch": 1.9981460882461994,
"grad_norm": 0.10794175677487869,
"learning_rate": 1.0046726542708303e-05,
"loss": 0.4032,
"step": 4042
},
{
"epoch": 1.9986404647138796,
"grad_norm": 0.10896418730655225,
"learning_rate": 1.004283268904475e-05,
"loss": 0.4013,
"step": 4043
},
{
"epoch": 1.9991348411815597,
"grad_norm": 0.10578818945558066,
"learning_rate": 1.0038938828886732e-05,
"loss": 0.4148,
"step": 4044
},
{
"epoch": 1.9996292176492398,
"grad_norm": 0.10506601692559571,
"learning_rate": 1.0035044962824652e-05,
"loss": 0.4091,
"step": 4045
},
{
"epoch": 2.0,
"grad_norm": 0.10994589106934467,
"learning_rate": 1.0031151091448917e-05,
"loss": 0.3971,
"step": 4046
},
{
"epoch": 2.0004943764676804,
"grad_norm": 0.1648848200369849,
"learning_rate": 1.0027257215349928e-05,
"loss": 0.3014,
"step": 4047
},
{
"epoch": 2.0009887529353603,
"grad_norm": 0.1933400591880518,
"learning_rate": 1.0023363335118088e-05,
"loss": 0.2719,
"step": 4048
},
{
"epoch": 2.0009887529353603,
"eval_loss": 0.4953591525554657,
"eval_runtime": 100.9559,
"eval_samples_per_second": 300.666,
"eval_steps_per_second": 37.591,
"step": 4048
},
{
"epoch": 2.0014831294030406,
"grad_norm": 0.15473407428939845,
"learning_rate": 1.0019469451343806e-05,
"loss": 0.3377,
"step": 4049
},
{
"epoch": 2.0019775058707205,
"grad_norm": 0.5113539908084875,
"learning_rate": 1.001557556461749e-05,
"loss": 0.3005,
"step": 4050
},
{
"epoch": 2.002471882338401,
"grad_norm": 0.1361725186356404,
"learning_rate": 1.0011681675529545e-05,
"loss": 0.297,
"step": 4051
},
{
"epoch": 2.002966258806081,
"grad_norm": 0.15972866827022225,
"learning_rate": 1.0007787784670376e-05,
"loss": 0.2787,
"step": 4052
},
{
"epoch": 2.003460635273761,
"grad_norm": 0.14586285603148824,
"learning_rate": 1.0003893892630391e-05,
"loss": 0.2943,
"step": 4053
},
{
"epoch": 2.003955011741441,
"grad_norm": 0.15096715817433493,
"learning_rate": 1e-05,
"loss": 0.295,
"step": 4054
},
{
"epoch": 2.0044493882091214,
"grad_norm": 0.13650730017874188,
"learning_rate": 9.99610610736961e-06,
"loss": 0.2931,
"step": 4055
},
{
"epoch": 2.0049437646768014,
"grad_norm": 0.13789352116186646,
"learning_rate": 9.992212215329626e-06,
"loss": 0.3095,
"step": 4056
},
{
"epoch": 2.0054381411444817,
"grad_norm": 0.14292079924625284,
"learning_rate": 9.988318324470456e-06,
"loss": 0.3073,
"step": 4057
},
{
"epoch": 2.0059325176121616,
"grad_norm": 0.14250853791469503,
"learning_rate": 9.98442443538251e-06,
"loss": 0.2849,
"step": 4058
},
{
"epoch": 2.006426894079842,
"grad_norm": 0.12982842566196007,
"learning_rate": 9.980530548656195e-06,
"loss": 0.2893,
"step": 4059
},
{
"epoch": 2.006921270547522,
"grad_norm": 0.13674625046823624,
"learning_rate": 9.976636664881916e-06,
"loss": 0.2959,
"step": 4060
},
{
"epoch": 2.0074156470152023,
"grad_norm": 0.1305168361589308,
"learning_rate": 9.972742784650079e-06,
"loss": 0.2791,
"step": 4061
},
{
"epoch": 2.007910023482882,
"grad_norm": 0.13637426966077973,
"learning_rate": 9.968848908551088e-06,
"loss": 0.2983,
"step": 4062
},
{
"epoch": 2.0084043999505625,
"grad_norm": 0.12571032270393875,
"learning_rate": 9.964955037175348e-06,
"loss": 0.3297,
"step": 4063
},
{
"epoch": 2.0088987764182424,
"grad_norm": 0.12344825788592151,
"learning_rate": 9.96106117111327e-06,
"loss": 0.2814,
"step": 4064
},
{
"epoch": 2.009393152885923,
"grad_norm": 0.12091756603243017,
"learning_rate": 9.957167310955253e-06,
"loss": 0.2974,
"step": 4065
},
{
"epoch": 2.0098875293536027,
"grad_norm": 0.12136142968445648,
"learning_rate": 9.9532734572917e-06,
"loss": 0.2951,
"step": 4066
},
{
"epoch": 2.010381905821283,
"grad_norm": 0.11630126987645284,
"learning_rate": 9.94937961071301e-06,
"loss": 0.305,
"step": 4067
},
{
"epoch": 2.010876282288963,
"grad_norm": 0.1161143709524029,
"learning_rate": 9.945485771809585e-06,
"loss": 0.3003,
"step": 4068
},
{
"epoch": 2.0113706587566433,
"grad_norm": 0.11300254257900541,
"learning_rate": 9.94159194117183e-06,
"loss": 0.2861,
"step": 4069
},
{
"epoch": 2.0118650352243233,
"grad_norm": 0.1171301361808881,
"learning_rate": 9.937698119390137e-06,
"loss": 0.2909,
"step": 4070
},
{
"epoch": 2.0123594116920036,
"grad_norm": 0.11887687327160461,
"learning_rate": 9.933804307054904e-06,
"loss": 0.2918,
"step": 4071
},
{
"epoch": 2.0128537881596835,
"grad_norm": 0.1206593209397646,
"learning_rate": 9.929910504756529e-06,
"loss": 0.2794,
"step": 4072
},
{
"epoch": 2.013348164627364,
"grad_norm": 0.12681878991748408,
"learning_rate": 9.926016713085403e-06,
"loss": 0.3052,
"step": 4073
},
{
"epoch": 2.013842541095044,
"grad_norm": 0.12330683654285557,
"learning_rate": 9.922122932631915e-06,
"loss": 0.3174,
"step": 4074
},
{
"epoch": 2.014336917562724,
"grad_norm": 0.1173989183026726,
"learning_rate": 9.918229163986463e-06,
"loss": 0.2882,
"step": 4075
},
{
"epoch": 2.014831294030404,
"grad_norm": 0.12387970392386691,
"learning_rate": 9.914335407739435e-06,
"loss": 0.3164,
"step": 4076
},
{
"epoch": 2.0153256704980844,
"grad_norm": 0.12033280760122314,
"learning_rate": 9.910441664481213e-06,
"loss": 0.3026,
"step": 4077
},
{
"epoch": 2.0158200469657643,
"grad_norm": 0.11949327387370808,
"learning_rate": 9.906547934802184e-06,
"loss": 0.2932,
"step": 4078
},
{
"epoch": 2.0163144234334447,
"grad_norm": 0.11408876529169779,
"learning_rate": 9.90265421929273e-06,
"loss": 0.2965,
"step": 4079
},
{
"epoch": 2.0168087999011246,
"grad_norm": 0.12290163148475758,
"learning_rate": 9.898760518543236e-06,
"loss": 0.3028,
"step": 4080
},
{
"epoch": 2.017303176368805,
"grad_norm": 0.11967830526250917,
"learning_rate": 9.894866833144076e-06,
"loss": 0.2921,
"step": 4081
},
{
"epoch": 2.017797552836485,
"grad_norm": 0.11218246754177148,
"learning_rate": 9.890973163685627e-06,
"loss": 0.2798,
"step": 4082
},
{
"epoch": 2.0182919293041652,
"grad_norm": 0.11935662426287816,
"learning_rate": 9.887079510758268e-06,
"loss": 0.3135,
"step": 4083
},
{
"epoch": 2.018786305771845,
"grad_norm": 0.12970033257299074,
"learning_rate": 9.883185874952362e-06,
"loss": 0.3049,
"step": 4084
},
{
"epoch": 2.0192806822395255,
"grad_norm": 0.1253461994054506,
"learning_rate": 9.879292256858281e-06,
"loss": 0.293,
"step": 4085
},
{
"epoch": 2.0197750587072054,
"grad_norm": 0.11926169205533763,
"learning_rate": 9.875398657066391e-06,
"loss": 0.2964,
"step": 4086
},
{
"epoch": 2.0202694351748858,
"grad_norm": 0.12479702798266916,
"learning_rate": 9.871505076167057e-06,
"loss": 0.3045,
"step": 4087
},
{
"epoch": 2.0207638116425657,
"grad_norm": 0.11520234802287345,
"learning_rate": 9.867611514750637e-06,
"loss": 0.299,
"step": 4088
},
{
"epoch": 2.021258188110246,
"grad_norm": 0.12124166343751883,
"learning_rate": 9.86371797340749e-06,
"loss": 0.2766,
"step": 4089
},
{
"epoch": 2.021752564577926,
"grad_norm": 0.4041255377752972,
"learning_rate": 9.859824452727967e-06,
"loss": 0.3557,
"step": 4090
},
{
"epoch": 2.0222469410456063,
"grad_norm": 0.12573363330912685,
"learning_rate": 9.855930953302425e-06,
"loss": 0.2689,
"step": 4091
},
{
"epoch": 2.0227413175132862,
"grad_norm": 0.11392480282362348,
"learning_rate": 9.852037475721209e-06,
"loss": 0.2817,
"step": 4092
},
{
"epoch": 2.0232356939809666,
"grad_norm": 0.11236170262298845,
"learning_rate": 9.84814402057466e-06,
"loss": 0.2982,
"step": 4093
},
{
"epoch": 2.0237300704486465,
"grad_norm": 0.1153742981843893,
"learning_rate": 9.844250588453126e-06,
"loss": 0.293,
"step": 4094
},
{
"epoch": 2.024224446916327,
"grad_norm": 0.13525473300129068,
"learning_rate": 9.840357179946938e-06,
"loss": 0.2891,
"step": 4095
},
{
"epoch": 2.0247188233840068,
"grad_norm": 0.1121989650619322,
"learning_rate": 9.836463795646437e-06,
"loss": 0.2695,
"step": 4096
},
{
"epoch": 2.025213199851687,
"grad_norm": 0.1311667434312025,
"learning_rate": 9.832570436141948e-06,
"loss": 0.3073,
"step": 4097
},
{
"epoch": 2.025707576319367,
"grad_norm": 0.1211380455167081,
"learning_rate": 9.8286771020238e-06,
"loss": 0.3024,
"step": 4098
},
{
"epoch": 2.0262019527870474,
"grad_norm": 0.12411874517794132,
"learning_rate": 9.824783793882319e-06,
"loss": 0.2998,
"step": 4099
},
{
"epoch": 2.0266963292547273,
"grad_norm": 0.11961192339956568,
"learning_rate": 9.820890512307817e-06,
"loss": 0.2827,
"step": 4100
},
{
"epoch": 2.0271907057224077,
"grad_norm": 0.11763054231803592,
"learning_rate": 9.816997257890612e-06,
"loss": 0.2903,
"step": 4101
},
{
"epoch": 2.0276850821900876,
"grad_norm": 0.1210478877595317,
"learning_rate": 9.813104031221016e-06,
"loss": 0.3143,
"step": 4102
},
{
"epoch": 2.028179458657768,
"grad_norm": 0.11768702965057995,
"learning_rate": 9.809210832889338e-06,
"loss": 0.3081,
"step": 4103
},
{
"epoch": 2.028673835125448,
"grad_norm": 0.11739597887112002,
"learning_rate": 9.805317663485875e-06,
"loss": 0.296,
"step": 4104
},
{
"epoch": 2.029168211593128,
"grad_norm": 0.11063893185202443,
"learning_rate": 9.801424523600928e-06,
"loss": 0.2738,
"step": 4105
},
{
"epoch": 2.029662588060808,
"grad_norm": 0.11737626922422112,
"learning_rate": 9.797531413824787e-06,
"loss": 0.2897,
"step": 4106
},
{
"epoch": 2.0301569645284885,
"grad_norm": 0.12708739722103504,
"learning_rate": 9.793638334747747e-06,
"loss": 0.3009,
"step": 4107
},
{
"epoch": 2.0306513409961684,
"grad_norm": 0.11464108175266491,
"learning_rate": 9.78974528696009e-06,
"loss": 0.2935,
"step": 4108
},
{
"epoch": 2.0311457174638488,
"grad_norm": 0.14756169195823912,
"learning_rate": 9.785852271052092e-06,
"loss": 0.2963,
"step": 4109
},
{
"epoch": 2.0316400939315287,
"grad_norm": 0.11316695687997387,
"learning_rate": 9.781959287614032e-06,
"loss": 0.2754,
"step": 4110
},
{
"epoch": 2.032134470399209,
"grad_norm": 0.12310058149284947,
"learning_rate": 9.778066337236177e-06,
"loss": 0.3031,
"step": 4111
},
{
"epoch": 2.032628846866889,
"grad_norm": 0.11540467275350103,
"learning_rate": 9.77417342050879e-06,
"loss": 0.303,
"step": 4112
},
{
"epoch": 2.0331232233345693,
"grad_norm": 0.12132725616682513,
"learning_rate": 9.770280538022137e-06,
"loss": 0.2986,
"step": 4113
},
{
"epoch": 2.033617599802249,
"grad_norm": 0.12606281844894834,
"learning_rate": 9.766387690366466e-06,
"loss": 0.2997,
"step": 4114
},
{
"epoch": 2.0341119762699296,
"grad_norm": 0.11089188649297109,
"learning_rate": 9.762494878132033e-06,
"loss": 0.2938,
"step": 4115
},
{
"epoch": 2.0346063527376095,
"grad_norm": 0.11608485265062657,
"learning_rate": 9.758602101909074e-06,
"loss": 0.3077,
"step": 4116
},
{
"epoch": 2.03510072920529,
"grad_norm": 0.11522714104035288,
"learning_rate": 9.754709362287826e-06,
"loss": 0.3014,
"step": 4117
},
{
"epoch": 2.0355951056729698,
"grad_norm": 0.11473044756845094,
"learning_rate": 9.750816659858536e-06,
"loss": 0.3001,
"step": 4118
},
{
"epoch": 2.03608948214065,
"grad_norm": 0.21179710681809882,
"learning_rate": 9.746923995211417e-06,
"loss": 0.3166,
"step": 4119
},
{
"epoch": 2.0365838586083305,
"grad_norm": 0.9271761805809972,
"learning_rate": 9.743031368936696e-06,
"loss": 0.3179,
"step": 4120
},
{
"epoch": 2.0370782350760104,
"grad_norm": 0.12132767587794815,
"learning_rate": 9.739138781624586e-06,
"loss": 0.298,
"step": 4121
},
{
"epoch": 2.0375726115436907,
"grad_norm": 0.12384538290240961,
"learning_rate": 9.735246233865302e-06,
"loss": 0.3187,
"step": 4122
},
{
"epoch": 2.0380669880113707,
"grad_norm": 0.12594707440583836,
"learning_rate": 9.731353726249038e-06,
"loss": 0.288,
"step": 4123
},
{
"epoch": 2.038561364479051,
"grad_norm": 0.11703108145788134,
"learning_rate": 9.727461259366003e-06,
"loss": 0.3098,
"step": 4124
},
{
"epoch": 2.039055740946731,
"grad_norm": 0.12522762410544327,
"learning_rate": 9.723568833806382e-06,
"loss": 0.2961,
"step": 4125
},
{
"epoch": 2.0395501174144113,
"grad_norm": 0.118428637210629,
"learning_rate": 9.719676450160361e-06,
"loss": 0.2726,
"step": 4126
},
{
"epoch": 2.040044493882091,
"grad_norm": 0.12771627388029388,
"learning_rate": 9.71578410901812e-06,
"loss": 0.3108,
"step": 4127
},
{
"epoch": 2.0405388703497715,
"grad_norm": 0.12426196835794255,
"learning_rate": 9.711891810969826e-06,
"loss": 0.2955,
"step": 4128
},
{
"epoch": 2.0410332468174515,
"grad_norm": 0.12322359079348434,
"learning_rate": 9.707999556605653e-06,
"loss": 0.295,
"step": 4129
},
{
"epoch": 2.041527623285132,
"grad_norm": 0.11962118163052984,
"learning_rate": 9.704107346515756e-06,
"loss": 0.2964,
"step": 4130
},
{
"epoch": 2.0420219997528117,
"grad_norm": 0.11963378558853091,
"learning_rate": 9.700215181290287e-06,
"loss": 0.2986,
"step": 4131
},
{
"epoch": 2.042516376220492,
"grad_norm": 0.12127422456283007,
"learning_rate": 9.696323061519397e-06,
"loss": 0.2998,
"step": 4132
},
{
"epoch": 2.043010752688172,
"grad_norm": 0.12307478562334848,
"learning_rate": 9.692430987793215e-06,
"loss": 0.2953,
"step": 4133
},
{
"epoch": 2.0435051291558524,
"grad_norm": 0.11699033086731016,
"learning_rate": 9.688538960701878e-06,
"loss": 0.2762,
"step": 4134
},
{
"epoch": 2.0439995056235323,
"grad_norm": 0.11792816009039118,
"learning_rate": 9.684646980835513e-06,
"loss": 0.2938,
"step": 4135
},
{
"epoch": 2.0444938820912126,
"grad_norm": 0.11781744094598263,
"learning_rate": 9.680755048784235e-06,
"loss": 0.2787,
"step": 4136
},
{
"epoch": 2.0449882585588925,
"grad_norm": 0.11400666731425942,
"learning_rate": 9.676863165138156e-06,
"loss": 0.2754,
"step": 4137
},
{
"epoch": 2.045482635026573,
"grad_norm": 0.12318501426969412,
"learning_rate": 9.672971330487375e-06,
"loss": 0.3062,
"step": 4138
},
{
"epoch": 2.045977011494253,
"grad_norm": 0.13638692324948373,
"learning_rate": 9.669079545421989e-06,
"loss": 0.2857,
"step": 4139
},
{
"epoch": 2.046471387961933,
"grad_norm": 0.12351944671143159,
"learning_rate": 9.66518781053209e-06,
"loss": 0.3065,
"step": 4140
},
{
"epoch": 2.046965764429613,
"grad_norm": 0.11790185210423368,
"learning_rate": 9.661296126407757e-06,
"loss": 0.3042,
"step": 4141
},
{
"epoch": 2.0474601408972934,
"grad_norm": 0.12262903599601586,
"learning_rate": 9.657404493639061e-06,
"loss": 0.3098,
"step": 4142
},
{
"epoch": 2.0479545173649734,
"grad_norm": 0.1204168094684351,
"learning_rate": 9.653512912816067e-06,
"loss": 0.2964,
"step": 4143
},
{
"epoch": 2.0484488938326537,
"grad_norm": 0.12548312019302846,
"learning_rate": 9.649621384528832e-06,
"loss": 0.2952,
"step": 4144
},
{
"epoch": 2.0489432703003336,
"grad_norm": 0.12856453668985998,
"learning_rate": 9.645729909367402e-06,
"loss": 0.2806,
"step": 4145
},
{
"epoch": 2.049437646768014,
"grad_norm": 0.12013646825832061,
"learning_rate": 9.641838487921827e-06,
"loss": 0.2987,
"step": 4146
},
{
"epoch": 2.049932023235694,
"grad_norm": 0.11647620209389309,
"learning_rate": 9.637947120782131e-06,
"loss": 0.2887,
"step": 4147
},
{
"epoch": 2.0504263997033743,
"grad_norm": 0.11807797805349733,
"learning_rate": 9.634055808538347e-06,
"loss": 0.2919,
"step": 4148
},
{
"epoch": 2.050920776171054,
"grad_norm": 0.1193218210005017,
"learning_rate": 9.630164551780484e-06,
"loss": 0.2842,
"step": 4149
},
{
"epoch": 2.0514151526387345,
"grad_norm": 0.12485788726114766,
"learning_rate": 9.626273351098547e-06,
"loss": 0.2966,
"step": 4150
},
{
"epoch": 2.0519095291064144,
"grad_norm": 0.12446715900921028,
"learning_rate": 9.622382207082548e-06,
"loss": 0.3184,
"step": 4151
},
{
"epoch": 2.052403905574095,
"grad_norm": 0.12725676670572209,
"learning_rate": 9.618491120322468e-06,
"loss": 0.2819,
"step": 4152
},
{
"epoch": 2.0528982820417747,
"grad_norm": 0.11475648926365163,
"learning_rate": 9.614600091408293e-06,
"loss": 0.2995,
"step": 4153
},
{
"epoch": 2.053392658509455,
"grad_norm": 1.8057722163898087,
"learning_rate": 9.610709120929993e-06,
"loss": 0.4497,
"step": 4154
},
{
"epoch": 2.053887034977135,
"grad_norm": 0.14322902883048616,
"learning_rate": 9.60681820947754e-06,
"loss": 0.3091,
"step": 4155
},
{
"epoch": 2.0543814114448153,
"grad_norm": 0.14390908625864607,
"learning_rate": 9.602927357640876e-06,
"loss": 0.3031,
"step": 4156
},
{
"epoch": 2.0548757879124953,
"grad_norm": 0.11967208601236681,
"learning_rate": 9.599036566009961e-06,
"loss": 0.2914,
"step": 4157
},
{
"epoch": 2.0553701643801756,
"grad_norm": 0.1420254357415399,
"learning_rate": 9.595145835174729e-06,
"loss": 0.2916,
"step": 4158
},
{
"epoch": 2.0558645408478555,
"grad_norm": 0.13578468987722064,
"learning_rate": 9.591255165725104e-06,
"loss": 0.2853,
"step": 4159
},
{
"epoch": 2.056358917315536,
"grad_norm": 0.13473606842646796,
"learning_rate": 9.587364558251008e-06,
"loss": 0.2713,
"step": 4160
},
{
"epoch": 2.056853293783216,
"grad_norm": 0.14919270919392338,
"learning_rate": 9.583474013342347e-06,
"loss": 0.3208,
"step": 4161
},
{
"epoch": 2.057347670250896,
"grad_norm": 0.1446531586112002,
"learning_rate": 9.579583531589027e-06,
"loss": 0.2827,
"step": 4162
},
{
"epoch": 2.057842046718576,
"grad_norm": 0.12965464595593246,
"learning_rate": 9.575693113580935e-06,
"loss": 0.2862,
"step": 4163
},
{
"epoch": 2.0583364231862564,
"grad_norm": 0.12877628577271863,
"learning_rate": 9.57180275990795e-06,
"loss": 0.2956,
"step": 4164
},
{
"epoch": 2.0588307996539363,
"grad_norm": 0.13361732234997262,
"learning_rate": 9.567912471159949e-06,
"loss": 0.2916,
"step": 4165
},
{
"epoch": 2.0593251761216167,
"grad_norm": 0.12778721553599773,
"learning_rate": 9.564022247926786e-06,
"loss": 0.3315,
"step": 4166
},
{
"epoch": 2.0598195525892966,
"grad_norm": 0.13561524382804072,
"learning_rate": 9.560132090798314e-06,
"loss": 0.2812,
"step": 4167
},
{
"epoch": 2.060313929056977,
"grad_norm": 0.12605596400623545,
"learning_rate": 9.556242000364378e-06,
"loss": 0.2985,
"step": 4168
},
{
"epoch": 2.060808305524657,
"grad_norm": 0.12798856110148737,
"learning_rate": 9.552351977214806e-06,
"loss": 0.2995,
"step": 4169
},
{
"epoch": 2.0613026819923372,
"grad_norm": 0.12582158192147996,
"learning_rate": 9.54846202193942e-06,
"loss": 0.3003,
"step": 4170
},
{
"epoch": 2.061797058460017,
"grad_norm": 0.12313792776246422,
"learning_rate": 9.544572135128034e-06,
"loss": 0.2902,
"step": 4171
},
{
"epoch": 2.0622914349276975,
"grad_norm": 0.12892045705708557,
"learning_rate": 9.540682317370436e-06,
"loss": 0.2952,
"step": 4172
},
{
"epoch": 2.0627858113953774,
"grad_norm": 0.12139479685176105,
"learning_rate": 9.536792569256429e-06,
"loss": 0.293,
"step": 4173
},
{
"epoch": 2.063280187863058,
"grad_norm": 0.1252488812341344,
"learning_rate": 9.532902891375788e-06,
"loss": 0.2924,
"step": 4174
},
{
"epoch": 2.0637745643307377,
"grad_norm": 0.13109171533769753,
"learning_rate": 9.52901328431828e-06,
"loss": 0.2997,
"step": 4175
},
{
"epoch": 2.064268940798418,
"grad_norm": 0.13136985592796208,
"learning_rate": 9.525123748673663e-06,
"loss": 0.3114,
"step": 4176
},
{
"epoch": 2.064763317266098,
"grad_norm": 0.1303826124981578,
"learning_rate": 9.521234285031682e-06,
"loss": 0.2885,
"step": 4177
},
{
"epoch": 2.0652576937337783,
"grad_norm": 0.17753206969244528,
"learning_rate": 9.51734489398208e-06,
"loss": 0.2914,
"step": 4178
},
{
"epoch": 2.0657520702014582,
"grad_norm": 0.11752206092382925,
"learning_rate": 9.513455576114575e-06,
"loss": 0.3056,
"step": 4179
},
{
"epoch": 2.0662464466691386,
"grad_norm": 0.12673713668035522,
"learning_rate": 9.509566332018885e-06,
"loss": 0.2872,
"step": 4180
},
{
"epoch": 2.0667408231368185,
"grad_norm": 0.11831534162830235,
"learning_rate": 9.505677162284713e-06,
"loss": 0.2944,
"step": 4181
},
{
"epoch": 2.067235199604499,
"grad_norm": 0.12050849948530418,
"learning_rate": 9.501788067501748e-06,
"loss": 0.2987,
"step": 4182
},
{
"epoch": 2.0677295760721788,
"grad_norm": 0.13804951993418485,
"learning_rate": 9.497899048259668e-06,
"loss": 0.2991,
"step": 4183
},
{
"epoch": 2.068223952539859,
"grad_norm": 0.12472333341124686,
"learning_rate": 9.494010105148148e-06,
"loss": 0.2963,
"step": 4184
},
{
"epoch": 2.068718329007539,
"grad_norm": 0.11651575504758166,
"learning_rate": 9.49012123875684e-06,
"loss": 0.2852,
"step": 4185
},
{
"epoch": 2.0692127054752194,
"grad_norm": 0.12043453937846024,
"learning_rate": 9.48623244967539e-06,
"loss": 0.285,
"step": 4186
},
{
"epoch": 2.0697070819428993,
"grad_norm": 0.13285039639998006,
"learning_rate": 9.482343738493436e-06,
"loss": 0.3138,
"step": 4187
},
{
"epoch": 2.0702014584105797,
"grad_norm": 0.12661614545533095,
"learning_rate": 9.478455105800594e-06,
"loss": 0.3357,
"step": 4188
},
{
"epoch": 2.0706958348782596,
"grad_norm": 0.13958969128161414,
"learning_rate": 9.47456655218648e-06,
"loss": 0.2926,
"step": 4189
},
{
"epoch": 2.07119021134594,
"grad_norm": 0.11848032916645435,
"learning_rate": 9.47067807824069e-06,
"loss": 0.2862,
"step": 4190
},
{
"epoch": 2.07168458781362,
"grad_norm": 0.11593886389219106,
"learning_rate": 9.466789684552808e-06,
"loss": 0.2866,
"step": 4191
},
{
"epoch": 2.0721789642813,
"grad_norm": 0.12078485297713197,
"learning_rate": 9.462901371712408e-06,
"loss": 0.2828,
"step": 4192
},
{
"epoch": 2.07267334074898,
"grad_norm": 0.12425969217607614,
"learning_rate": 9.459013140309052e-06,
"loss": 0.3113,
"step": 4193
},
{
"epoch": 2.0731677172166605,
"grad_norm": 0.1191025489924173,
"learning_rate": 9.455124990932289e-06,
"loss": 0.2994,
"step": 4194
},
{
"epoch": 2.073662093684341,
"grad_norm": 0.12576429447082302,
"learning_rate": 9.451236924171657e-06,
"loss": 0.2709,
"step": 4195
},
{
"epoch": 2.0741564701520208,
"grad_norm": 0.11546469277672543,
"learning_rate": 9.447348940616683e-06,
"loss": 0.3154,
"step": 4196
},
{
"epoch": 2.074650846619701,
"grad_norm": 0.12098936431214387,
"learning_rate": 9.443461040856873e-06,
"loss": 0.2904,
"step": 4197
},
{
"epoch": 2.075145223087381,
"grad_norm": 0.12390098642199593,
"learning_rate": 9.439573225481729e-06,
"loss": 0.2927,
"step": 4198
},
{
"epoch": 2.0756395995550614,
"grad_norm": 0.11993692560949636,
"learning_rate": 9.435685495080731e-06,
"loss": 0.2888,
"step": 4199
},
{
"epoch": 2.0761339760227413,
"grad_norm": 0.12204661443349756,
"learning_rate": 9.431797850243367e-06,
"loss": 0.302,
"step": 4200
},
{
"epoch": 2.0766283524904217,
"grad_norm": 0.1441694222226582,
"learning_rate": 9.427910291559083e-06,
"loss": 0.2975,
"step": 4201
},
{
"epoch": 2.0771227289581016,
"grad_norm": 0.11969125748165858,
"learning_rate": 9.424022819617332e-06,
"loss": 0.286,
"step": 4202
},
{
"epoch": 2.077617105425782,
"grad_norm": 0.12242991279360095,
"learning_rate": 9.420135435007547e-06,
"loss": 0.2929,
"step": 4203
},
{
"epoch": 2.078111481893462,
"grad_norm": 0.13328166746021083,
"learning_rate": 9.416248138319152e-06,
"loss": 0.2925,
"step": 4204
},
{
"epoch": 2.078605858361142,
"grad_norm": 0.12883367090522133,
"learning_rate": 9.412360930141544e-06,
"loss": 0.2983,
"step": 4205
},
{
"epoch": 2.079100234828822,
"grad_norm": 0.1165832243801497,
"learning_rate": 9.40847381106413e-06,
"loss": 0.2998,
"step": 4206
},
{
"epoch": 2.0795946112965025,
"grad_norm": 0.12031796724850037,
"learning_rate": 9.404586781676286e-06,
"loss": 0.2829,
"step": 4207
},
{
"epoch": 2.0800889877641824,
"grad_norm": 0.1250075027750069,
"learning_rate": 9.400699842567376e-06,
"loss": 0.2869,
"step": 4208
},
{
"epoch": 2.0805833642318627,
"grad_norm": 0.13406279567888466,
"learning_rate": 9.396812994326756e-06,
"loss": 0.3007,
"step": 4209
},
{
"epoch": 2.0810777406995427,
"grad_norm": 0.11573387690161387,
"learning_rate": 9.392926237543765e-06,
"loss": 0.2753,
"step": 4210
},
{
"epoch": 2.081572117167223,
"grad_norm": 0.1155297772216056,
"learning_rate": 9.389039572807727e-06,
"loss": 0.2862,
"step": 4211
},
{
"epoch": 2.082066493634903,
"grad_norm": 0.13276776671832682,
"learning_rate": 9.385153000707958e-06,
"loss": 0.317,
"step": 4212
},
{
"epoch": 2.0825608701025833,
"grad_norm": 0.11372282984015288,
"learning_rate": 9.381266521833751e-06,
"loss": 0.2824,
"step": 4213
},
{
"epoch": 2.083055246570263,
"grad_norm": 0.1256198078646515,
"learning_rate": 9.377380136774394e-06,
"loss": 0.2929,
"step": 4214
},
{
"epoch": 2.0835496230379436,
"grad_norm": 0.11909592873360855,
"learning_rate": 9.373493846119153e-06,
"loss": 0.3212,
"step": 4215
},
{
"epoch": 2.0840439995056235,
"grad_norm": 1.0497647582306033,
"learning_rate": 9.36960765045728e-06,
"loss": 0.2968,
"step": 4216
},
{
"epoch": 2.084538375973304,
"grad_norm": 0.12138085242570894,
"learning_rate": 9.365721550378021e-06,
"loss": 0.2946,
"step": 4217
},
{
"epoch": 2.0850327524409837,
"grad_norm": 0.12194241870531049,
"learning_rate": 9.3618355464706e-06,
"loss": 0.2927,
"step": 4218
},
{
"epoch": 2.085527128908664,
"grad_norm": 0.11569838539894828,
"learning_rate": 9.357949639324229e-06,
"loss": 0.2837,
"step": 4219
},
{
"epoch": 2.086021505376344,
"grad_norm": 0.1257876662782587,
"learning_rate": 9.354063829528105e-06,
"loss": 0.287,
"step": 4220
},
{
"epoch": 2.0865158818440244,
"grad_norm": 0.11726090992640656,
"learning_rate": 9.350178117671405e-06,
"loss": 0.2985,
"step": 4221
},
{
"epoch": 2.0870102583117043,
"grad_norm": 0.12650881102829303,
"learning_rate": 9.346292504343306e-06,
"loss": 0.2801,
"step": 4222
},
{
"epoch": 2.0875046347793846,
"grad_norm": 0.1259087731864783,
"learning_rate": 9.342406990132954e-06,
"loss": 0.3238,
"step": 4223
},
{
"epoch": 2.0879990112470646,
"grad_norm": 0.12007470376723725,
"learning_rate": 9.338521575629487e-06,
"loss": 0.302,
"step": 4224
},
{
"epoch": 2.088493387714745,
"grad_norm": 0.11813701170167651,
"learning_rate": 9.334636261422027e-06,
"loss": 0.2794,
"step": 4225
},
{
"epoch": 2.088987764182425,
"grad_norm": 0.11500755639545045,
"learning_rate": 9.33075104809968e-06,
"loss": 0.2887,
"step": 4226
},
{
"epoch": 2.089482140650105,
"grad_norm": 0.12263186364208599,
"learning_rate": 9.326865936251537e-06,
"loss": 0.314,
"step": 4227
},
{
"epoch": 2.089976517117785,
"grad_norm": 0.1360385353487021,
"learning_rate": 9.322980926466678e-06,
"loss": 0.3196,
"step": 4228
},
{
"epoch": 2.0904708935854655,
"grad_norm": 0.11851121585737852,
"learning_rate": 9.319096019334163e-06,
"loss": 0.3009,
"step": 4229
},
{
"epoch": 2.0909652700531454,
"grad_norm": 0.12468497632988843,
"learning_rate": 9.315211215443037e-06,
"loss": 0.2943,
"step": 4230
},
{
"epoch": 2.0914596465208257,
"grad_norm": 0.12674530701701878,
"learning_rate": 9.311326515382326e-06,
"loss": 0.3242,
"step": 4231
},
{
"epoch": 2.0919540229885056,
"grad_norm": 0.14625002314183755,
"learning_rate": 9.307441919741041e-06,
"loss": 0.2983,
"step": 4232
},
{
"epoch": 2.092448399456186,
"grad_norm": 0.11933575615753068,
"learning_rate": 9.303557429108193e-06,
"loss": 0.2913,
"step": 4233
},
{
"epoch": 2.092942775923866,
"grad_norm": 0.11912835094996306,
"learning_rate": 9.299673044072753e-06,
"loss": 0.2959,
"step": 4234
},
{
"epoch": 2.0934371523915463,
"grad_norm": 0.12161057430307255,
"learning_rate": 9.295788765223692e-06,
"loss": 0.2998,
"step": 4235
},
{
"epoch": 2.093931528859226,
"grad_norm": 0.11982955455833183,
"learning_rate": 9.291904593149957e-06,
"loss": 0.2835,
"step": 4236
},
{
"epoch": 2.0944259053269065,
"grad_norm": 0.12071558195206011,
"learning_rate": 9.288020528440484e-06,
"loss": 0.2897,
"step": 4237
},
{
"epoch": 2.0949202817945864,
"grad_norm": 0.11961798088681178,
"learning_rate": 9.284136571684183e-06,
"loss": 0.309,
"step": 4238
},
{
"epoch": 2.095414658262267,
"grad_norm": 0.12687381053137833,
"learning_rate": 9.280252723469965e-06,
"loss": 0.3133,
"step": 4239
},
{
"epoch": 2.0959090347299467,
"grad_norm": 0.12456191408978508,
"learning_rate": 9.276368984386715e-06,
"loss": 0.2991,
"step": 4240
},
{
"epoch": 2.096403411197627,
"grad_norm": 0.12898359589702232,
"learning_rate": 9.272485355023293e-06,
"loss": 0.2987,
"step": 4241
},
{
"epoch": 2.096897787665307,
"grad_norm": 0.11910325649752784,
"learning_rate": 9.268601835968555e-06,
"loss": 0.287,
"step": 4242
},
{
"epoch": 2.0973921641329873,
"grad_norm": 0.12236622270449463,
"learning_rate": 9.264718427811333e-06,
"loss": 0.2898,
"step": 4243
},
{
"epoch": 2.0978865406006673,
"grad_norm": 0.1322488509768057,
"learning_rate": 9.260835131140448e-06,
"loss": 0.3203,
"step": 4244
},
{
"epoch": 2.0983809170683476,
"grad_norm": 0.12390151439275676,
"learning_rate": 9.256951946544701e-06,
"loss": 0.3025,
"step": 4245
},
{
"epoch": 2.0988752935360275,
"grad_norm": 0.11720018586497528,
"learning_rate": 9.253068874612876e-06,
"loss": 0.2978,
"step": 4246
},
{
"epoch": 2.099369670003708,
"grad_norm": 0.12082747796148958,
"learning_rate": 9.24918591593374e-06,
"loss": 0.2832,
"step": 4247
},
{
"epoch": 2.099864046471388,
"grad_norm": 0.12203505723882983,
"learning_rate": 9.245303071096038e-06,
"loss": 0.2923,
"step": 4248
},
{
"epoch": 2.100358422939068,
"grad_norm": 0.11439111666675726,
"learning_rate": 9.241420340688507e-06,
"loss": 0.3019,
"step": 4249
},
{
"epoch": 2.100852799406748,
"grad_norm": 0.12296579026125348,
"learning_rate": 9.237537725299861e-06,
"loss": 0.3029,
"step": 4250
},
{
"epoch": 2.1013471758744284,
"grad_norm": 0.11178387568610598,
"learning_rate": 9.2336552255188e-06,
"loss": 0.311,
"step": 4251
},
{
"epoch": 2.1018415523421083,
"grad_norm": 0.1155852862720769,
"learning_rate": 9.229772841934e-06,
"loss": 0.2902,
"step": 4252
},
{
"epoch": 2.1023359288097887,
"grad_norm": 0.11521964435726365,
"learning_rate": 9.225890575134128e-06,
"loss": 0.2939,
"step": 4253
},
{
"epoch": 2.1028303052774686,
"grad_norm": 0.11713912493030701,
"learning_rate": 9.222008425707822e-06,
"loss": 0.2867,
"step": 4254
},
{
"epoch": 2.103324681745149,
"grad_norm": 0.12218018368184799,
"learning_rate": 9.218126394243716e-06,
"loss": 0.3154,
"step": 4255
},
{
"epoch": 2.103819058212829,
"grad_norm": 0.11947246734332446,
"learning_rate": 9.214244481330419e-06,
"loss": 0.2814,
"step": 4256
},
{
"epoch": 2.1043134346805092,
"grad_norm": 0.12005809435846532,
"learning_rate": 9.210362687556518e-06,
"loss": 0.326,
"step": 4257
},
{
"epoch": 2.104807811148189,
"grad_norm": 0.11502184154676295,
"learning_rate": 9.20648101351059e-06,
"loss": 0.2971,
"step": 4258
},
{
"epoch": 2.1053021876158695,
"grad_norm": 0.12136667226548299,
"learning_rate": 9.202599459781183e-06,
"loss": 0.2972,
"step": 4259
},
{
"epoch": 2.1057965640835494,
"grad_norm": 0.1141526904403056,
"learning_rate": 9.19871802695684e-06,
"loss": 0.3057,
"step": 4260
},
{
"epoch": 2.10629094055123,
"grad_norm": 0.11627121195921922,
"learning_rate": 9.19483671562608e-06,
"loss": 0.3075,
"step": 4261
},
{
"epoch": 2.1067853170189097,
"grad_norm": 0.12259421820891576,
"learning_rate": 9.1909555263774e-06,
"loss": 0.2926,
"step": 4262
},
{
"epoch": 2.10727969348659,
"grad_norm": 0.12210216885457245,
"learning_rate": 9.187074459799285e-06,
"loss": 0.2971,
"step": 4263
},
{
"epoch": 2.10777406995427,
"grad_norm": 0.12210769693030604,
"learning_rate": 9.183193516480193e-06,
"loss": 0.3061,
"step": 4264
},
{
"epoch": 2.1082684464219503,
"grad_norm": 0.12093666050516506,
"learning_rate": 9.179312697008569e-06,
"loss": 0.2947,
"step": 4265
},
{
"epoch": 2.1087628228896302,
"grad_norm": 0.1254051719579555,
"learning_rate": 9.17543200197284e-06,
"loss": 0.2927,
"step": 4266
},
{
"epoch": 2.1092571993573106,
"grad_norm": 0.12575561274559222,
"learning_rate": 9.171551431961416e-06,
"loss": 0.3006,
"step": 4267
},
{
"epoch": 2.1097515758249905,
"grad_norm": 0.11989794486982037,
"learning_rate": 9.167670987562677e-06,
"loss": 0.3158,
"step": 4268
},
{
"epoch": 2.110245952292671,
"grad_norm": 0.12001983881537599,
"learning_rate": 9.163790669364998e-06,
"loss": 0.3026,
"step": 4269
},
{
"epoch": 2.1107403287603512,
"grad_norm": 0.12519281016973405,
"learning_rate": 9.159910477956724e-06,
"loss": 0.3058,
"step": 4270
},
{
"epoch": 2.111234705228031,
"grad_norm": 0.12239839529010735,
"learning_rate": 9.156030413926188e-06,
"loss": 0.2957,
"step": 4271
},
{
"epoch": 2.1117290816957115,
"grad_norm": 0.12071289124262145,
"learning_rate": 9.152150477861701e-06,
"loss": 0.2927,
"step": 4272
},
{
"epoch": 2.1122234581633914,
"grad_norm": 0.11678502582945566,
"learning_rate": 9.148270670351552e-06,
"loss": 0.2911,
"step": 4273
},
{
"epoch": 2.1127178346310718,
"grad_norm": 0.12417066364117824,
"learning_rate": 9.144390991984014e-06,
"loss": 0.301,
"step": 4274
},
{
"epoch": 2.1132122110987517,
"grad_norm": 0.12621826679574383,
"learning_rate": 9.140511443347341e-06,
"loss": 0.2865,
"step": 4275
},
{
"epoch": 2.113706587566432,
"grad_norm": 0.11320242640006847,
"learning_rate": 9.136632025029762e-06,
"loss": 0.2854,
"step": 4276
},
{
"epoch": 2.114200964034112,
"grad_norm": 0.125658100737923,
"learning_rate": 9.132752737619493e-06,
"loss": 0.2993,
"step": 4277
},
{
"epoch": 2.1146953405017923,
"grad_norm": 0.11475891589537607,
"learning_rate": 9.128873581704726e-06,
"loss": 0.2898,
"step": 4278
},
{
"epoch": 2.1151897169694722,
"grad_norm": 0.11494694833191695,
"learning_rate": 9.124994557873638e-06,
"loss": 0.2942,
"step": 4279
},
{
"epoch": 2.1156840934371526,
"grad_norm": 0.1233563622360292,
"learning_rate": 9.121115666714375e-06,
"loss": 0.3159,
"step": 4280
},
{
"epoch": 2.1161784699048325,
"grad_norm": 0.11812467453897514,
"learning_rate": 9.11723690881507e-06,
"loss": 0.284,
"step": 4281
},
{
"epoch": 2.116672846372513,
"grad_norm": 0.12447809453025886,
"learning_rate": 9.113358284763846e-06,
"loss": 0.3091,
"step": 4282
},
{
"epoch": 2.1171672228401928,
"grad_norm": 0.11764842438337414,
"learning_rate": 9.109479795148787e-06,
"loss": 0.2885,
"step": 4283
},
{
"epoch": 2.117661599307873,
"grad_norm": 0.11375809492337931,
"learning_rate": 9.105601440557966e-06,
"loss": 0.2817,
"step": 4284
},
{
"epoch": 2.118155975775553,
"grad_norm": 0.12049670936594223,
"learning_rate": 9.101723221579437e-06,
"loss": 0.2904,
"step": 4285
},
{
"epoch": 2.1186503522432334,
"grad_norm": 0.12463144658821891,
"learning_rate": 9.097845138801232e-06,
"loss": 0.2921,
"step": 4286
},
{
"epoch": 2.1191447287109133,
"grad_norm": 0.12043859417202335,
"learning_rate": 9.093967192811351e-06,
"loss": 0.2965,
"step": 4287
},
{
"epoch": 2.1196391051785937,
"grad_norm": 0.11874242067359779,
"learning_rate": 9.090089384197798e-06,
"loss": 0.2874,
"step": 4288
},
{
"epoch": 2.1201334816462736,
"grad_norm": 0.11645545163686305,
"learning_rate": 9.086211713548537e-06,
"loss": 0.3065,
"step": 4289
},
{
"epoch": 2.120627858113954,
"grad_norm": 0.11955899165281451,
"learning_rate": 9.082334181451514e-06,
"loss": 0.2774,
"step": 4290
},
{
"epoch": 2.121122234581634,
"grad_norm": 0.1201597979079157,
"learning_rate": 9.078456788494654e-06,
"loss": 0.2939,
"step": 4291
},
{
"epoch": 2.121616611049314,
"grad_norm": 0.11971690315268914,
"learning_rate": 9.074579535265864e-06,
"loss": 0.2898,
"step": 4292
},
{
"epoch": 2.122110987516994,
"grad_norm": 0.12887201034763954,
"learning_rate": 9.070702422353033e-06,
"loss": 0.3017,
"step": 4293
},
{
"epoch": 2.1226053639846745,
"grad_norm": 0.12962286672457435,
"learning_rate": 9.066825450344022e-06,
"loss": 0.2923,
"step": 4294
},
{
"epoch": 2.1230997404523544,
"grad_norm": 0.12392101799275258,
"learning_rate": 9.062948619826673e-06,
"loss": 0.3107,
"step": 4295
},
{
"epoch": 2.1235941169200347,
"grad_norm": 0.12172626030911222,
"learning_rate": 9.059071931388808e-06,
"loss": 0.2917,
"step": 4296
},
{
"epoch": 2.1240884933877147,
"grad_norm": 0.1328652371588361,
"learning_rate": 9.055195385618221e-06,
"loss": 0.2992,
"step": 4297
},
{
"epoch": 2.124582869855395,
"grad_norm": 0.12491930534974222,
"learning_rate": 9.05131898310269e-06,
"loss": 0.3092,
"step": 4298
},
{
"epoch": 2.125077246323075,
"grad_norm": 0.1233631614622434,
"learning_rate": 9.047442724429977e-06,
"loss": 0.2996,
"step": 4299
},
{
"epoch": 2.1255716227907553,
"grad_norm": 0.12088373029749329,
"learning_rate": 9.043566610187812e-06,
"loss": 0.2911,
"step": 4300
},
{
"epoch": 2.126065999258435,
"grad_norm": 0.12377456160414127,
"learning_rate": 9.039690640963906e-06,
"loss": 0.287,
"step": 4301
},
{
"epoch": 2.1265603757261156,
"grad_norm": 0.12232120931344326,
"learning_rate": 9.035814817345951e-06,
"loss": 0.2966,
"step": 4302
},
{
"epoch": 2.1270547521937955,
"grad_norm": 0.1188600169183758,
"learning_rate": 9.03193913992161e-06,
"loss": 0.2816,
"step": 4303
},
{
"epoch": 2.127549128661476,
"grad_norm": 0.12896035652360407,
"learning_rate": 9.028063609278537e-06,
"loss": 0.3063,
"step": 4304
},
{
"epoch": 2.1280435051291557,
"grad_norm": 0.12221402313213513,
"learning_rate": 9.024188226004353e-06,
"loss": 0.2976,
"step": 4305
},
{
"epoch": 2.128537881596836,
"grad_norm": 0.11826594482110935,
"learning_rate": 9.020312990686654e-06,
"loss": 0.3003,
"step": 4306
},
{
"epoch": 2.129032258064516,
"grad_norm": 0.12305634145021656,
"learning_rate": 9.016437903913022e-06,
"loss": 0.3265,
"step": 4307
},
{
"epoch": 2.1295266345321964,
"grad_norm": 0.11561702590727127,
"learning_rate": 9.012562966271014e-06,
"loss": 0.2984,
"step": 4308
},
{
"epoch": 2.1300210109998763,
"grad_norm": 0.11568245072204016,
"learning_rate": 9.00868817834816e-06,
"loss": 0.2966,
"step": 4309
},
{
"epoch": 2.1305153874675566,
"grad_norm": 0.11854884980135666,
"learning_rate": 9.004813540731976e-06,
"loss": 0.3053,
"step": 4310
},
{
"epoch": 2.1310097639352366,
"grad_norm": 0.12131660819986896,
"learning_rate": 9.000939054009947e-06,
"loss": 0.2734,
"step": 4311
},
{
"epoch": 2.131504140402917,
"grad_norm": 0.11856159479632294,
"learning_rate": 8.99706471876954e-06,
"loss": 0.2759,
"step": 4312
},
{
"epoch": 2.131998516870597,
"grad_norm": 0.1191562204228915,
"learning_rate": 8.993190535598196e-06,
"loss": 0.2931,
"step": 4313
},
{
"epoch": 2.132492893338277,
"grad_norm": 0.12718609979621737,
"learning_rate": 8.989316505083328e-06,
"loss": 0.2799,
"step": 4314
},
{
"epoch": 2.132987269805957,
"grad_norm": 0.11484175726781866,
"learning_rate": 8.985442627812345e-06,
"loss": 0.2776,
"step": 4315
},
{
"epoch": 2.1334816462736375,
"grad_norm": 0.11389859425462089,
"learning_rate": 8.981568904372612e-06,
"loss": 0.3015,
"step": 4316
},
{
"epoch": 2.1339760227413174,
"grad_norm": 0.12699296824061312,
"learning_rate": 8.977695335351479e-06,
"loss": 0.2941,
"step": 4317
},
{
"epoch": 2.1344703992089977,
"grad_norm": 0.11793054856048336,
"learning_rate": 8.973821921336273e-06,
"loss": 0.2875,
"step": 4318
},
{
"epoch": 2.1349647756766776,
"grad_norm": 0.12097295556910918,
"learning_rate": 8.969948662914297e-06,
"loss": 0.3047,
"step": 4319
},
{
"epoch": 2.135459152144358,
"grad_norm": 0.12796220357693616,
"learning_rate": 8.966075560672823e-06,
"loss": 0.3047,
"step": 4320
},
{
"epoch": 2.135953528612038,
"grad_norm": 0.12239410027066505,
"learning_rate": 8.962202615199116e-06,
"loss": 0.3119,
"step": 4321
},
{
"epoch": 2.1364479050797183,
"grad_norm": 0.11511654341194598,
"learning_rate": 8.958329827080406e-06,
"loss": 0.2941,
"step": 4322
},
{
"epoch": 2.136942281547398,
"grad_norm": 0.11812194906065858,
"learning_rate": 8.954457196903897e-06,
"loss": 0.2863,
"step": 4323
},
{
"epoch": 2.1374366580150785,
"grad_norm": 0.11987701937429256,
"learning_rate": 8.950584725256774e-06,
"loss": 0.298,
"step": 4324
},
{
"epoch": 2.1379310344827585,
"grad_norm": 0.12375465338805644,
"learning_rate": 8.946712412726193e-06,
"loss": 0.3066,
"step": 4325
},
{
"epoch": 2.138425410950439,
"grad_norm": 0.12191421597316775,
"learning_rate": 8.942840259899298e-06,
"loss": 0.3156,
"step": 4326
},
{
"epoch": 2.1389197874181187,
"grad_norm": 0.12153764337119227,
"learning_rate": 8.938968267363195e-06,
"loss": 0.2928,
"step": 4327
},
{
"epoch": 2.139414163885799,
"grad_norm": 0.12036835648631262,
"learning_rate": 8.93509643570497e-06,
"loss": 0.3087,
"step": 4328
},
{
"epoch": 2.139908540353479,
"grad_norm": 0.12144485738479552,
"learning_rate": 8.93122476551169e-06,
"loss": 0.3079,
"step": 4329
},
{
"epoch": 2.1404029168211594,
"grad_norm": 0.11938346078173115,
"learning_rate": 8.927353257370388e-06,
"loss": 0.2945,
"step": 4330
},
{
"epoch": 2.1408972932888393,
"grad_norm": 0.23156203001258352,
"learning_rate": 8.923481911868078e-06,
"loss": 0.2839,
"step": 4331
},
{
"epoch": 2.1413916697565196,
"grad_norm": 0.11515250998494514,
"learning_rate": 8.919610729591754e-06,
"loss": 0.3012,
"step": 4332
},
{
"epoch": 2.1418860462241995,
"grad_norm": 0.4206912387990821,
"learning_rate": 8.915739711128376e-06,
"loss": 0.3763,
"step": 4333
},
{
"epoch": 2.14238042269188,
"grad_norm": 0.12235012366002333,
"learning_rate": 8.911868857064885e-06,
"loss": 0.2918,
"step": 4334
},
{
"epoch": 2.14287479915956,
"grad_norm": 0.13549212816887726,
"learning_rate": 8.907998167988195e-06,
"loss": 0.3191,
"step": 4335
},
{
"epoch": 2.14336917562724,
"grad_norm": 0.12040249538936093,
"learning_rate": 8.90412764448519e-06,
"loss": 0.2924,
"step": 4336
},
{
"epoch": 2.14386355209492,
"grad_norm": 0.12695648018093705,
"learning_rate": 8.900257287142744e-06,
"loss": 0.3275,
"step": 4337
},
{
"epoch": 2.1443579285626004,
"grad_norm": 0.12133031200327862,
"learning_rate": 8.896387096547693e-06,
"loss": 0.2913,
"step": 4338
},
{
"epoch": 2.1448523050302803,
"grad_norm": 0.12733888474257493,
"learning_rate": 8.892517073286847e-06,
"loss": 0.2936,
"step": 4339
},
{
"epoch": 2.1453466814979607,
"grad_norm": 0.12445634801393184,
"learning_rate": 8.888647217946997e-06,
"loss": 0.2965,
"step": 4340
},
{
"epoch": 2.145841057965641,
"grad_norm": 0.12072692570531782,
"learning_rate": 8.884777531114902e-06,
"loss": 0.3098,
"step": 4341
},
{
"epoch": 2.146335434433321,
"grad_norm": 0.11659371625844349,
"learning_rate": 8.880908013377307e-06,
"loss": 0.2855,
"step": 4342
},
{
"epoch": 2.146829810901001,
"grad_norm": 0.11990134905136646,
"learning_rate": 8.877038665320918e-06,
"loss": 0.3101,
"step": 4343
},
{
"epoch": 2.1473241873686812,
"grad_norm": 0.1273173745977574,
"learning_rate": 8.873169487532425e-06,
"loss": 0.2987,
"step": 4344
},
{
"epoch": 2.1478185638363616,
"grad_norm": 0.12837563189683426,
"learning_rate": 8.869300480598486e-06,
"loss": 0.3079,
"step": 4345
},
{
"epoch": 2.1483129403040415,
"grad_norm": 0.11922840542011955,
"learning_rate": 8.865431645105734e-06,
"loss": 0.3034,
"step": 4346
},
{
"epoch": 2.1488073167717214,
"grad_norm": 0.12224065284517835,
"learning_rate": 8.861562981640776e-06,
"loss": 0.3016,
"step": 4347
},
{
"epoch": 2.149301693239402,
"grad_norm": 0.11689486927277112,
"learning_rate": 8.8576944907902e-06,
"loss": 0.3032,
"step": 4348
},
{
"epoch": 2.149796069707082,
"grad_norm": 0.11491914206384325,
"learning_rate": 8.853826173140559e-06,
"loss": 0.2886,
"step": 4349
},
{
"epoch": 2.150290446174762,
"grad_norm": 0.12021826643892156,
"learning_rate": 8.849958029278383e-06,
"loss": 0.3187,
"step": 4350
},
{
"epoch": 2.1507848226424424,
"grad_norm": 0.12005233722511918,
"learning_rate": 8.846090059790176e-06,
"loss": 0.2979,
"step": 4351
},
{
"epoch": 2.1512791991101223,
"grad_norm": 0.1307504881840007,
"learning_rate": 8.84222226526241e-06,
"loss": 0.3034,
"step": 4352
},
{
"epoch": 2.1517735755778027,
"grad_norm": 0.11952684995782233,
"learning_rate": 8.838354646281544e-06,
"loss": 0.2883,
"step": 4353
},
{
"epoch": 2.1522679520454826,
"grad_norm": 0.11727405625316598,
"learning_rate": 8.834487203433998e-06,
"loss": 0.2795,
"step": 4354
},
{
"epoch": 2.152762328513163,
"grad_norm": 0.11395436836400705,
"learning_rate": 8.830619937306168e-06,
"loss": 0.2917,
"step": 4355
},
{
"epoch": 2.153256704980843,
"grad_norm": 0.12790427959707829,
"learning_rate": 8.826752848484425e-06,
"loss": 0.3023,
"step": 4356
},
{
"epoch": 2.1537510814485232,
"grad_norm": 0.11975477818436236,
"learning_rate": 8.822885937555113e-06,
"loss": 0.2891,
"step": 4357
},
{
"epoch": 2.154245457916203,
"grad_norm": 0.1227643798497565,
"learning_rate": 8.819019205104544e-06,
"loss": 0.2722,
"step": 4358
},
{
"epoch": 2.1547398343838835,
"grad_norm": 0.11576186815744485,
"learning_rate": 8.815152651719015e-06,
"loss": 0.287,
"step": 4359
},
{
"epoch": 2.1552342108515634,
"grad_norm": 0.13013267528040715,
"learning_rate": 8.811286277984785e-06,
"loss": 0.2859,
"step": 4360
},
{
"epoch": 2.1557285873192438,
"grad_norm": 0.11141675005209407,
"learning_rate": 8.807420084488092e-06,
"loss": 0.2938,
"step": 4361
},
{
"epoch": 2.1562229637869237,
"grad_norm": 0.13206861946818838,
"learning_rate": 8.803554071815139e-06,
"loss": 0.2717,
"step": 4362
},
{
"epoch": 2.156717340254604,
"grad_norm": 0.2067522087241948,
"learning_rate": 8.799688240552102e-06,
"loss": 0.3297,
"step": 4363
},
{
"epoch": 2.157211716722284,
"grad_norm": 0.12362815926287145,
"learning_rate": 8.795822591285147e-06,
"loss": 0.2855,
"step": 4364
},
{
"epoch": 2.1577060931899643,
"grad_norm": 0.11358579577032335,
"learning_rate": 8.79195712460039e-06,
"loss": 0.2951,
"step": 4365
},
{
"epoch": 2.1582004696576442,
"grad_norm": 0.12131334051420864,
"learning_rate": 8.788091841083932e-06,
"loss": 0.2988,
"step": 4366
},
{
"epoch": 2.1586948461253246,
"grad_norm": 0.12276716750057468,
"learning_rate": 8.78422674132184e-06,
"loss": 0.3191,
"step": 4367
},
{
"epoch": 2.1591892225930045,
"grad_norm": 0.1347480501177906,
"learning_rate": 8.78036182590016e-06,
"loss": 0.2859,
"step": 4368
},
{
"epoch": 2.159683599060685,
"grad_norm": 0.1165391599619833,
"learning_rate": 8.776497095404897e-06,
"loss": 0.2755,
"step": 4369
},
{
"epoch": 2.1601779755283648,
"grad_norm": 0.12212069421173748,
"learning_rate": 8.772632550422047e-06,
"loss": 0.289,
"step": 4370
},
{
"epoch": 2.160672351996045,
"grad_norm": 0.12555281722971035,
"learning_rate": 8.768768191537565e-06,
"loss": 0.2977,
"step": 4371
},
{
"epoch": 2.161166728463725,
"grad_norm": 0.1172722450492924,
"learning_rate": 8.764904019337378e-06,
"loss": 0.2961,
"step": 4372
},
{
"epoch": 2.1616611049314054,
"grad_norm": 0.12831179793634337,
"learning_rate": 8.76104003440739e-06,
"loss": 0.2968,
"step": 4373
},
{
"epoch": 2.1621554813990853,
"grad_norm": 0.12011243236812383,
"learning_rate": 8.75717623733347e-06,
"loss": 0.2744,
"step": 4374
},
{
"epoch": 2.1626498578667657,
"grad_norm": 0.12832418730558112,
"learning_rate": 8.753312628701468e-06,
"loss": 0.2729,
"step": 4375
},
{
"epoch": 2.1631442343344456,
"grad_norm": 0.11985490899630907,
"learning_rate": 8.749449209097197e-06,
"loss": 0.2845,
"step": 4376
},
{
"epoch": 2.163638610802126,
"grad_norm": 0.12707336240735093,
"learning_rate": 8.745585979106443e-06,
"loss": 0.2936,
"step": 4377
},
{
"epoch": 2.164132987269806,
"grad_norm": 0.12180026854906112,
"learning_rate": 8.741722939314967e-06,
"loss": 0.3001,
"step": 4378
},
{
"epoch": 2.164627363737486,
"grad_norm": 0.13394072527396936,
"learning_rate": 8.737860090308495e-06,
"loss": 0.2875,
"step": 4379
},
{
"epoch": 2.165121740205166,
"grad_norm": 0.12188914249274846,
"learning_rate": 8.733997432672729e-06,
"loss": 0.2982,
"step": 4380
},
{
"epoch": 2.1656161166728465,
"grad_norm": 0.12072519199247655,
"learning_rate": 8.730134966993342e-06,
"loss": 0.2762,
"step": 4381
},
{
"epoch": 2.1661104931405264,
"grad_norm": 0.13209903813006477,
"learning_rate": 8.726272693855976e-06,
"loss": 0.2926,
"step": 4382
},
{
"epoch": 2.1666048696082068,
"grad_norm": 0.13317288448988393,
"learning_rate": 8.722410613846244e-06,
"loss": 0.323,
"step": 4383
},
{
"epoch": 2.1670992460758867,
"grad_norm": 0.14121511576438298,
"learning_rate": 8.71854872754973e-06,
"loss": 0.3128,
"step": 4384
},
{
"epoch": 2.167593622543567,
"grad_norm": 0.129225444192718,
"learning_rate": 8.714687035551988e-06,
"loss": 0.2919,
"step": 4385
},
{
"epoch": 2.168087999011247,
"grad_norm": 0.1309596945877879,
"learning_rate": 8.710825538438544e-06,
"loss": 0.3164,
"step": 4386
},
{
"epoch": 2.1685823754789273,
"grad_norm": 0.1258421209789173,
"learning_rate": 8.706964236794897e-06,
"loss": 0.3176,
"step": 4387
},
{
"epoch": 2.169076751946607,
"grad_norm": 0.1261959338985848,
"learning_rate": 8.703103131206508e-06,
"loss": 0.3015,
"step": 4388
},
{
"epoch": 2.1695711284142876,
"grad_norm": 0.13218605737469535,
"learning_rate": 8.699242222258814e-06,
"loss": 0.2931,
"step": 4389
},
{
"epoch": 2.1700655048819675,
"grad_norm": 0.11707416653685859,
"learning_rate": 8.695381510537221e-06,
"loss": 0.2967,
"step": 4390
},
{
"epoch": 2.170559881349648,
"grad_norm": 0.12107428605041139,
"learning_rate": 8.691520996627107e-06,
"loss": 0.2872,
"step": 4391
},
{
"epoch": 2.1710542578173277,
"grad_norm": 0.11833647118481866,
"learning_rate": 8.68766068111382e-06,
"loss": 0.3051,
"step": 4392
},
{
"epoch": 2.171548634285008,
"grad_norm": 0.12355880733844381,
"learning_rate": 8.683800564582675e-06,
"loss": 0.2939,
"step": 4393
},
{
"epoch": 2.172043010752688,
"grad_norm": 0.1266893516292159,
"learning_rate": 8.679940647618961e-06,
"loss": 0.3039,
"step": 4394
},
{
"epoch": 2.1725373872203684,
"grad_norm": 0.1300588009145129,
"learning_rate": 8.676080930807928e-06,
"loss": 0.3092,
"step": 4395
},
{
"epoch": 2.1730317636880483,
"grad_norm": 0.13469186181747028,
"learning_rate": 8.672221414734802e-06,
"loss": 0.3097,
"step": 4396
},
{
"epoch": 2.1735261401557286,
"grad_norm": 0.13527906452192195,
"learning_rate": 8.668362099984786e-06,
"loss": 0.2888,
"step": 4397
},
{
"epoch": 2.1740205166234086,
"grad_norm": 0.13521387805665785,
"learning_rate": 8.66450298714304e-06,
"loss": 0.2767,
"step": 4398
},
{
"epoch": 2.174514893091089,
"grad_norm": 0.12553653407001122,
"learning_rate": 8.660644076794699e-06,
"loss": 0.2997,
"step": 4399
},
{
"epoch": 2.175009269558769,
"grad_norm": 0.13107578318145643,
"learning_rate": 8.656785369524864e-06,
"loss": 0.3115,
"step": 4400
},
{
"epoch": 2.175503646026449,
"grad_norm": 0.11993639015488294,
"learning_rate": 8.652926865918613e-06,
"loss": 0.3025,
"step": 4401
},
{
"epoch": 2.175998022494129,
"grad_norm": 0.13289670819451832,
"learning_rate": 8.649068566560976e-06,
"loss": 0.2911,
"step": 4402
},
{
"epoch": 2.1764923989618095,
"grad_norm": 0.11727703230214281,
"learning_rate": 8.645210472036978e-06,
"loss": 0.3021,
"step": 4403
},
{
"epoch": 2.1769867754294894,
"grad_norm": 0.12019273505588626,
"learning_rate": 8.641352582931593e-06,
"loss": 0.2873,
"step": 4404
},
{
"epoch": 2.1774811518971697,
"grad_norm": 0.12646064712207725,
"learning_rate": 8.637494899829768e-06,
"loss": 0.2772,
"step": 4405
},
{
"epoch": 2.1779755283648496,
"grad_norm": 0.12606880823266944,
"learning_rate": 8.633637423316422e-06,
"loss": 0.3125,
"step": 4406
},
{
"epoch": 2.17846990483253,
"grad_norm": 0.12689943066183115,
"learning_rate": 8.629780153976438e-06,
"loss": 0.3218,
"step": 4407
},
{
"epoch": 2.17896428130021,
"grad_norm": 0.12690647321613818,
"learning_rate": 8.625923092394675e-06,
"loss": 0.2878,
"step": 4408
},
{
"epoch": 2.1794586577678903,
"grad_norm": 0.12974148372408775,
"learning_rate": 8.622066239155957e-06,
"loss": 0.3091,
"step": 4409
},
{
"epoch": 2.17995303423557,
"grad_norm": 0.1189594481611365,
"learning_rate": 8.61820959484507e-06,
"loss": 0.2967,
"step": 4410
},
{
"epoch": 2.1804474107032505,
"grad_norm": 0.12164623713133751,
"learning_rate": 8.61435316004678e-06,
"loss": 0.2652,
"step": 4411
},
{
"epoch": 2.1809417871709305,
"grad_norm": 0.11662829293788712,
"learning_rate": 8.610496935345811e-06,
"loss": 0.2862,
"step": 4412
},
{
"epoch": 2.181436163638611,
"grad_norm": 0.1296524799987129,
"learning_rate": 8.606640921326855e-06,
"loss": 0.2951,
"step": 4413
},
{
"epoch": 2.1819305401062907,
"grad_norm": 0.11996254142013846,
"learning_rate": 8.602785118574586e-06,
"loss": 0.2918,
"step": 4414
},
{
"epoch": 2.182424916573971,
"grad_norm": 0.12217271033873195,
"learning_rate": 8.598929527673631e-06,
"loss": 0.2903,
"step": 4415
},
{
"epoch": 2.1829192930416514,
"grad_norm": 0.12425353214165782,
"learning_rate": 8.595074149208591e-06,
"loss": 0.3099,
"step": 4416
},
{
"epoch": 2.1834136695093314,
"grad_norm": 0.12058961047680686,
"learning_rate": 8.591218983764036e-06,
"loss": 0.2978,
"step": 4417
},
{
"epoch": 2.1839080459770113,
"grad_norm": 0.12633708982631295,
"learning_rate": 8.587364031924492e-06,
"loss": 0.3017,
"step": 4418
},
{
"epoch": 2.1844024224446916,
"grad_norm": 0.1235976898412053,
"learning_rate": 8.583509294274474e-06,
"loss": 0.3279,
"step": 4419
},
{
"epoch": 2.184896798912372,
"grad_norm": 0.126206117814767,
"learning_rate": 8.57965477139845e-06,
"loss": 0.2829,
"step": 4420
},
{
"epoch": 2.185391175380052,
"grad_norm": 0.11616630518626739,
"learning_rate": 8.575800463880856e-06,
"loss": 0.323,
"step": 4421
},
{
"epoch": 2.1858855518477323,
"grad_norm": 0.12351983390199839,
"learning_rate": 8.571946372306097e-06,
"loss": 0.305,
"step": 4422
},
{
"epoch": 2.186379928315412,
"grad_norm": 0.12424536884527003,
"learning_rate": 8.568092497258544e-06,
"loss": 0.301,
"step": 4423
},
{
"epoch": 2.1868743047830925,
"grad_norm": 0.12493255005959143,
"learning_rate": 8.564238839322544e-06,
"loss": 0.2898,
"step": 4424
},
{
"epoch": 2.1873686812507724,
"grad_norm": 0.1224815949817496,
"learning_rate": 8.560385399082398e-06,
"loss": 0.2932,
"step": 4425
},
{
"epoch": 2.187863057718453,
"grad_norm": 0.11870654791782008,
"learning_rate": 8.556532177122383e-06,
"loss": 0.2712,
"step": 4426
},
{
"epoch": 2.1883574341861327,
"grad_norm": 0.12221717381198226,
"learning_rate": 8.55267917402674e-06,
"loss": 0.2985,
"step": 4427
},
{
"epoch": 2.188851810653813,
"grad_norm": 0.12049949603063717,
"learning_rate": 8.548826390379674e-06,
"loss": 0.2835,
"step": 4428
},
{
"epoch": 2.189346187121493,
"grad_norm": 0.11659322306063652,
"learning_rate": 8.54497382676536e-06,
"loss": 0.2761,
"step": 4429
},
{
"epoch": 2.1898405635891733,
"grad_norm": 0.11689578830157672,
"learning_rate": 8.54112148376794e-06,
"loss": 0.2978,
"step": 4430
},
{
"epoch": 2.1903349400568533,
"grad_norm": 0.12161320422001565,
"learning_rate": 8.537269361971523e-06,
"loss": 0.29,
"step": 4431
},
{
"epoch": 2.1908293165245336,
"grad_norm": 0.12173844360993068,
"learning_rate": 8.533417461960182e-06,
"loss": 0.2813,
"step": 4432
},
{
"epoch": 2.1913236929922135,
"grad_norm": 0.11850602475811557,
"learning_rate": 8.529565784317958e-06,
"loss": 0.296,
"step": 4433
},
{
"epoch": 2.191818069459894,
"grad_norm": 0.16700979781654504,
"learning_rate": 8.525714329628855e-06,
"loss": 0.3074,
"step": 4434
},
{
"epoch": 2.192312445927574,
"grad_norm": 0.13223287099866132,
"learning_rate": 8.521863098476851e-06,
"loss": 0.3206,
"step": 4435
},
{
"epoch": 2.192806822395254,
"grad_norm": 2.06778307278071,
"learning_rate": 8.518012091445884e-06,
"loss": 0.3124,
"step": 4436
},
{
"epoch": 2.193301198862934,
"grad_norm": 0.13912602865582674,
"learning_rate": 8.514161309119853e-06,
"loss": 0.3169,
"step": 4437
},
{
"epoch": 2.1937955753306144,
"grad_norm": 0.1599967071890661,
"learning_rate": 8.510310752082635e-06,
"loss": 0.3029,
"step": 4438
},
{
"epoch": 2.1942899517982943,
"grad_norm": 0.1147638994521793,
"learning_rate": 8.506460420918067e-06,
"loss": 0.2958,
"step": 4439
},
{
"epoch": 2.1947843282659747,
"grad_norm": 0.12435545234174913,
"learning_rate": 8.502610316209947e-06,
"loss": 0.2896,
"step": 4440
},
{
"epoch": 2.1952787047336546,
"grad_norm": 0.1261902118311915,
"learning_rate": 8.498760438542048e-06,
"loss": 0.2866,
"step": 4441
},
{
"epoch": 2.195773081201335,
"grad_norm": 0.12388421808727543,
"learning_rate": 8.494910788498101e-06,
"loss": 0.3109,
"step": 4442
},
{
"epoch": 2.196267457669015,
"grad_norm": 0.11942822820531651,
"learning_rate": 8.49106136666181e-06,
"loss": 0.3041,
"step": 4443
},
{
"epoch": 2.1967618341366952,
"grad_norm": 0.12956382076301684,
"learning_rate": 8.487212173616835e-06,
"loss": 0.3017,
"step": 4444
},
{
"epoch": 2.197256210604375,
"grad_norm": 0.12472080109899499,
"learning_rate": 8.4833632099468e-06,
"loss": 0.2858,
"step": 4445
},
{
"epoch": 2.1977505870720555,
"grad_norm": 0.12200850034014314,
"learning_rate": 8.479514476235317e-06,
"loss": 0.3003,
"step": 4446
},
{
"epoch": 2.1982449635397354,
"grad_norm": 0.12574705397113128,
"learning_rate": 8.475665973065934e-06,
"loss": 0.3025,
"step": 4447
},
{
"epoch": 2.1987393400074158,
"grad_norm": 0.1365406838996783,
"learning_rate": 8.47181770102218e-06,
"loss": 0.3169,
"step": 4448
},
{
"epoch": 2.1992337164750957,
"grad_norm": 0.11845153970607293,
"learning_rate": 8.467969660687543e-06,
"loss": 0.2841,
"step": 4449
},
{
"epoch": 2.199728092942776,
"grad_norm": 0.12375505351216744,
"learning_rate": 8.464121852645484e-06,
"loss": 0.304,
"step": 4450
},
{
"epoch": 2.200222469410456,
"grad_norm": 0.12824574122004764,
"learning_rate": 8.460274277479413e-06,
"loss": 0.3044,
"step": 4451
},
{
"epoch": 2.2007168458781363,
"grad_norm": 0.12202627116734299,
"learning_rate": 8.456426935772724e-06,
"loss": 0.3118,
"step": 4452
},
{
"epoch": 2.2012112223458162,
"grad_norm": 0.12146150180357929,
"learning_rate": 8.452579828108766e-06,
"loss": 0.3032,
"step": 4453
},
{
"epoch": 2.2017055988134966,
"grad_norm": 0.11987536560279315,
"learning_rate": 8.448732955070848e-06,
"loss": 0.3003,
"step": 4454
},
{
"epoch": 2.2021999752811765,
"grad_norm": 0.12176435673503361,
"learning_rate": 8.444886317242251e-06,
"loss": 0.2777,
"step": 4455
},
{
"epoch": 2.202694351748857,
"grad_norm": 0.11760308185397841,
"learning_rate": 8.441039915206215e-06,
"loss": 0.3029,
"step": 4456
},
{
"epoch": 2.2031887282165368,
"grad_norm": 0.12530677561488532,
"learning_rate": 8.43719374954595e-06,
"loss": 0.3004,
"step": 4457
},
{
"epoch": 2.203683104684217,
"grad_norm": 0.12090539572141795,
"learning_rate": 8.433347820844628e-06,
"loss": 0.2958,
"step": 4458
},
{
"epoch": 2.204177481151897,
"grad_norm": 0.1238990735516753,
"learning_rate": 8.429502129685381e-06,
"loss": 0.291,
"step": 4459
},
{
"epoch": 2.2046718576195774,
"grad_norm": 0.12126676977001777,
"learning_rate": 8.42565667665131e-06,
"loss": 0.3034,
"step": 4460
},
{
"epoch": 2.2051662340872573,
"grad_norm": 0.11912714598416313,
"learning_rate": 8.421811462325478e-06,
"loss": 0.2859,
"step": 4461
},
{
"epoch": 2.2056606105549377,
"grad_norm": 0.12335898099312095,
"learning_rate": 8.417966487290906e-06,
"loss": 0.3057,
"step": 4462
},
{
"epoch": 2.2061549870226176,
"grad_norm": 0.11894053927422724,
"learning_rate": 8.414121752130594e-06,
"loss": 0.3024,
"step": 4463
},
{
"epoch": 2.206649363490298,
"grad_norm": 0.12350348257201467,
"learning_rate": 8.41027725742749e-06,
"loss": 0.2948,
"step": 4464
},
{
"epoch": 2.207143739957978,
"grad_norm": 0.11797080278449401,
"learning_rate": 8.406433003764514e-06,
"loss": 0.3028,
"step": 4465
},
{
"epoch": 2.207638116425658,
"grad_norm": 0.11983821297310163,
"learning_rate": 8.402588991724545e-06,
"loss": 0.2987,
"step": 4466
},
{
"epoch": 2.208132492893338,
"grad_norm": 0.11983667979195972,
"learning_rate": 8.39874522189043e-06,
"loss": 0.313,
"step": 4467
},
{
"epoch": 2.2086268693610185,
"grad_norm": 0.11695863695416922,
"learning_rate": 8.394901694844975e-06,
"loss": 0.316,
"step": 4468
},
{
"epoch": 2.2091212458286984,
"grad_norm": 0.12716745562863438,
"learning_rate": 8.391058411170957e-06,
"loss": 0.3451,
"step": 4469
},
{
"epoch": 2.2096156222963788,
"grad_norm": 0.11976622310729033,
"learning_rate": 8.387215371451099e-06,
"loss": 0.3072,
"step": 4470
},
{
"epoch": 2.2101099987640587,
"grad_norm": 0.11451147943608882,
"learning_rate": 8.383372576268107e-06,
"loss": 0.2733,
"step": 4471
},
{
"epoch": 2.210604375231739,
"grad_norm": 0.12296586654473671,
"learning_rate": 8.379530026204635e-06,
"loss": 0.2984,
"step": 4472
},
{
"epoch": 2.211098751699419,
"grad_norm": 0.13111629132713076,
"learning_rate": 8.375687721843308e-06,
"loss": 0.3122,
"step": 4473
},
{
"epoch": 2.2115931281670993,
"grad_norm": 0.12250033421928241,
"learning_rate": 8.371845663766715e-06,
"loss": 0.2973,
"step": 4474
},
{
"epoch": 2.212087504634779,
"grad_norm": 0.1259481177077571,
"learning_rate": 8.3680038525574e-06,
"loss": 0.295,
"step": 4475
},
{
"epoch": 2.2125818811024596,
"grad_norm": 0.11730812837164194,
"learning_rate": 8.364162288797879e-06,
"loss": 0.2863,
"step": 4476
},
{
"epoch": 2.2130762575701395,
"grad_norm": 0.1256674345825462,
"learning_rate": 8.360320973070618e-06,
"loss": 0.2905,
"step": 4477
},
{
"epoch": 2.21357063403782,
"grad_norm": 0.11900502520091591,
"learning_rate": 8.356479905958053e-06,
"loss": 0.3038,
"step": 4478
},
{
"epoch": 2.2140650105054998,
"grad_norm": 0.12506380710441847,
"learning_rate": 8.35263908804259e-06,
"loss": 0.3013,
"step": 4479
},
{
"epoch": 2.21455938697318,
"grad_norm": 0.11862425904645237,
"learning_rate": 8.348798519906583e-06,
"loss": 0.2883,
"step": 4480
},
{
"epoch": 2.21505376344086,
"grad_norm": 0.12570373421782577,
"learning_rate": 8.344958202132357e-06,
"loss": 0.3053,
"step": 4481
},
{
"epoch": 2.2155481399085404,
"grad_norm": 0.12317486220588665,
"learning_rate": 8.341118135302193e-06,
"loss": 0.295,
"step": 4482
},
{
"epoch": 2.2160425163762203,
"grad_norm": 0.1157004869495131,
"learning_rate": 8.337278319998343e-06,
"loss": 0.2934,
"step": 4483
},
{
"epoch": 2.2165368928439007,
"grad_norm": 0.12142876806166664,
"learning_rate": 8.333438756803004e-06,
"loss": 0.2805,
"step": 4484
},
{
"epoch": 2.2170312693115806,
"grad_norm": 0.12158457426492293,
"learning_rate": 8.32959944629836e-06,
"loss": 0.3098,
"step": 4485
},
{
"epoch": 2.217525645779261,
"grad_norm": 0.12976756373605175,
"learning_rate": 8.325760389066535e-06,
"loss": 0.3039,
"step": 4486
},
{
"epoch": 2.2180200222469413,
"grad_norm": 0.13647260235079786,
"learning_rate": 8.321921585689623e-06,
"loss": 0.3245,
"step": 4487
},
{
"epoch": 2.218514398714621,
"grad_norm": 0.12328737478600772,
"learning_rate": 8.318083036749677e-06,
"loss": 0.2756,
"step": 4488
},
{
"epoch": 2.219008775182301,
"grad_norm": 0.12754845012065996,
"learning_rate": 8.314244742828716e-06,
"loss": 0.3034,
"step": 4489
},
{
"epoch": 2.2195031516499815,
"grad_norm": 0.118429041401148,
"learning_rate": 8.310406704508718e-06,
"loss": 0.3105,
"step": 4490
},
{
"epoch": 2.219997528117662,
"grad_norm": 0.12687742591339876,
"learning_rate": 8.30656892237162e-06,
"loss": 0.2878,
"step": 4491
},
{
"epoch": 2.2204919045853417,
"grad_norm": 0.12534421153288677,
"learning_rate": 8.302731396999324e-06,
"loss": 0.2915,
"step": 4492
},
{
"epoch": 2.2209862810530216,
"grad_norm": 0.1197888059948552,
"learning_rate": 8.298894128973688e-06,
"loss": 0.305,
"step": 4493
},
{
"epoch": 2.221480657520702,
"grad_norm": 0.11845477061094961,
"learning_rate": 8.295057118876536e-06,
"loss": 0.2791,
"step": 4494
},
{
"epoch": 2.2219750339883824,
"grad_norm": 0.12165646955247911,
"learning_rate": 8.291220367289648e-06,
"loss": 0.2999,
"step": 4495
},
{
"epoch": 2.2224694104560623,
"grad_norm": 0.1277317525709378,
"learning_rate": 8.28738387479477e-06,
"loss": 0.3107,
"step": 4496
},
{
"epoch": 2.2229637869237426,
"grad_norm": 0.1241218696394546,
"learning_rate": 8.283547641973606e-06,
"loss": 0.2808,
"step": 4497
},
{
"epoch": 2.2234581633914225,
"grad_norm": 0.12202953946297174,
"learning_rate": 8.279711669407822e-06,
"loss": 0.3064,
"step": 4498
},
{
"epoch": 2.223952539859103,
"grad_norm": 0.13028633036331502,
"learning_rate": 8.275875957679045e-06,
"loss": 0.3533,
"step": 4499
},
{
"epoch": 2.224446916326783,
"grad_norm": 0.1347088731080458,
"learning_rate": 8.272040507368852e-06,
"loss": 0.2829,
"step": 4500
},
{
"epoch": 2.224941292794463,
"grad_norm": 0.12002206537990497,
"learning_rate": 8.2682053190588e-06,
"loss": 0.3108,
"step": 4501
},
{
"epoch": 2.225435669262143,
"grad_norm": 0.12026442136934043,
"learning_rate": 8.264370393330394e-06,
"loss": 0.3089,
"step": 4502
},
{
"epoch": 2.2259300457298234,
"grad_norm": 0.12101171132724683,
"learning_rate": 8.260535730765096e-06,
"loss": 0.2801,
"step": 4503
},
{
"epoch": 2.2264244221975034,
"grad_norm": 0.11843956398063746,
"learning_rate": 8.256701331944334e-06,
"loss": 0.2892,
"step": 4504
},
{
"epoch": 2.2269187986651837,
"grad_norm": 0.11881098775041925,
"learning_rate": 8.252867197449496e-06,
"loss": 0.3063,
"step": 4505
},
{
"epoch": 2.2274131751328636,
"grad_norm": 0.13718639225614643,
"learning_rate": 8.24903332786193e-06,
"loss": 0.2885,
"step": 4506
},
{
"epoch": 2.227907551600544,
"grad_norm": 0.1209076748260821,
"learning_rate": 8.24519972376294e-06,
"loss": 0.2792,
"step": 4507
},
{
"epoch": 2.228401928068224,
"grad_norm": 0.12225408153380739,
"learning_rate": 8.241366385733797e-06,
"loss": 0.3085,
"step": 4508
},
{
"epoch": 2.2288963045359043,
"grad_norm": 0.11897191368118917,
"learning_rate": 8.237533314355725e-06,
"loss": 0.3028,
"step": 4509
},
{
"epoch": 2.229390681003584,
"grad_norm": 0.11975349118134354,
"learning_rate": 8.233700510209905e-06,
"loss": 0.303,
"step": 4510
},
{
"epoch": 2.2298850574712645,
"grad_norm": 0.11620291643960791,
"learning_rate": 8.229867973877485e-06,
"loss": 0.2785,
"step": 4511
},
{
"epoch": 2.2303794339389444,
"grad_norm": 0.12206176581585262,
"learning_rate": 8.226035705939572e-06,
"loss": 0.2867,
"step": 4512
},
{
"epoch": 2.230873810406625,
"grad_norm": 0.12235905036258107,
"learning_rate": 8.222203706977229e-06,
"loss": 0.2827,
"step": 4513
},
{
"epoch": 2.2313681868743047,
"grad_norm": 0.11436129769248803,
"learning_rate": 8.218371977571476e-06,
"loss": 0.298,
"step": 4514
},
{
"epoch": 2.231862563341985,
"grad_norm": 0.12660705551493845,
"learning_rate": 8.2145405183033e-06,
"loss": 0.3216,
"step": 4515
},
{
"epoch": 2.232356939809665,
"grad_norm": 0.11663185293926814,
"learning_rate": 8.210709329753635e-06,
"loss": 0.283,
"step": 4516
},
{
"epoch": 2.2328513162773453,
"grad_norm": 0.11276580683392065,
"learning_rate": 8.20687841250339e-06,
"loss": 0.2746,
"step": 4517
},
{
"epoch": 2.2333456927450253,
"grad_norm": 0.12429959814793148,
"learning_rate": 8.20304776713342e-06,
"loss": 0.283,
"step": 4518
},
{
"epoch": 2.2338400692127056,
"grad_norm": 0.12235358733301835,
"learning_rate": 8.19921739422454e-06,
"loss": 0.3088,
"step": 4519
},
{
"epoch": 2.2343344456803855,
"grad_norm": 0.12317985120702334,
"learning_rate": 8.19538729435753e-06,
"loss": 0.3055,
"step": 4520
},
{
"epoch": 2.234828822148066,
"grad_norm": 0.12877590054061983,
"learning_rate": 8.191557468113123e-06,
"loss": 0.31,
"step": 4521
},
{
"epoch": 2.235323198615746,
"grad_norm": 14.290710241043,
"learning_rate": 8.187727916072013e-06,
"loss": 0.6906,
"step": 4522
},
{
"epoch": 2.235817575083426,
"grad_norm": 0.1263792636418845,
"learning_rate": 8.183898638814852e-06,
"loss": 0.2924,
"step": 4523
},
{
"epoch": 2.236311951551106,
"grad_norm": 0.13753072066150898,
"learning_rate": 8.180069636922252e-06,
"loss": 0.2941,
"step": 4524
},
{
"epoch": 2.2368063280187864,
"grad_norm": 0.12731678833439636,
"learning_rate": 8.176240910974784e-06,
"loss": 0.2827,
"step": 4525
},
{
"epoch": 2.2373007044864663,
"grad_norm": 0.11944922492586488,
"learning_rate": 8.172412461552967e-06,
"loss": 0.3081,
"step": 4526
},
{
"epoch": 2.2377950809541467,
"grad_norm": 0.13079394017158896,
"learning_rate": 8.168584289237289e-06,
"loss": 0.303,
"step": 4527
},
{
"epoch": 2.2382894574218266,
"grad_norm": 0.13405390595095443,
"learning_rate": 8.164756394608198e-06,
"loss": 0.2843,
"step": 4528
},
{
"epoch": 2.238783833889507,
"grad_norm": 0.1274060452546802,
"learning_rate": 8.16092877824609e-06,
"loss": 0.3105,
"step": 4529
},
{
"epoch": 2.239278210357187,
"grad_norm": 0.12564732037532872,
"learning_rate": 8.15710144073132e-06,
"loss": 0.3092,
"step": 4530
},
{
"epoch": 2.2397725868248672,
"grad_norm": 0.11865942882541085,
"learning_rate": 8.153274382644213e-06,
"loss": 0.2961,
"step": 4531
},
{
"epoch": 2.240266963292547,
"grad_norm": 0.12421802054276623,
"learning_rate": 8.149447604565038e-06,
"loss": 0.3043,
"step": 4532
},
{
"epoch": 2.2407613397602275,
"grad_norm": 0.13940636502311704,
"learning_rate": 8.14562110707402e-06,
"loss": 0.3,
"step": 4533
},
{
"epoch": 2.2412557162279074,
"grad_norm": 0.12711428161057403,
"learning_rate": 8.141794890751361e-06,
"loss": 0.2895,
"step": 4534
},
{
"epoch": 2.241750092695588,
"grad_norm": 0.12291863813042778,
"learning_rate": 8.137968956177201e-06,
"loss": 0.3171,
"step": 4535
},
{
"epoch": 2.2422444691632677,
"grad_norm": 0.1341645475093456,
"learning_rate": 8.134143303931642e-06,
"loss": 0.325,
"step": 4536
},
{
"epoch": 2.242738845630948,
"grad_norm": 0.12635370862573922,
"learning_rate": 8.130317934594747e-06,
"loss": 0.3331,
"step": 4537
},
{
"epoch": 2.243233222098628,
"grad_norm": 0.12946511775161051,
"learning_rate": 8.12649284874653e-06,
"loss": 0.2967,
"step": 4538
},
{
"epoch": 2.2437275985663083,
"grad_norm": 0.12206738252476483,
"learning_rate": 8.122668046966969e-06,
"loss": 0.2965,
"step": 4539
},
{
"epoch": 2.2442219750339882,
"grad_norm": 0.12042404147146056,
"learning_rate": 8.118843529835995e-06,
"loss": 0.2995,
"step": 4540
},
{
"epoch": 2.2447163515016686,
"grad_norm": 0.12369600146180929,
"learning_rate": 8.1150192979335e-06,
"loss": 0.2951,
"step": 4541
},
{
"epoch": 2.2452107279693485,
"grad_norm": 0.12614474230755676,
"learning_rate": 8.111195351839327e-06,
"loss": 0.2911,
"step": 4542
},
{
"epoch": 2.245705104437029,
"grad_norm": 0.12215693097920123,
"learning_rate": 8.107371692133276e-06,
"loss": 0.3082,
"step": 4543
},
{
"epoch": 2.246199480904709,
"grad_norm": 0.1261246333756444,
"learning_rate": 8.103548319395104e-06,
"loss": 0.3001,
"step": 4544
},
{
"epoch": 2.246693857372389,
"grad_norm": 0.1204839009912104,
"learning_rate": 8.09972523420453e-06,
"loss": 0.3062,
"step": 4545
},
{
"epoch": 2.247188233840069,
"grad_norm": 0.12484414711609497,
"learning_rate": 8.095902437141228e-06,
"loss": 0.301,
"step": 4546
},
{
"epoch": 2.2476826103077494,
"grad_norm": 0.12161403870553608,
"learning_rate": 8.09207992878482e-06,
"loss": 0.3148,
"step": 4547
},
{
"epoch": 2.2481769867754293,
"grad_norm": 0.12854551007976098,
"learning_rate": 8.088257709714892e-06,
"loss": 0.2861,
"step": 4548
},
{
"epoch": 2.2486713632431097,
"grad_norm": 0.11991947711777826,
"learning_rate": 8.084435780510983e-06,
"loss": 0.3008,
"step": 4549
},
{
"epoch": 2.2491657397107896,
"grad_norm": 0.12015827545423352,
"learning_rate": 8.080614141752594e-06,
"loss": 0.2847,
"step": 4550
},
{
"epoch": 2.24966011617847,
"grad_norm": 0.11950657816038898,
"learning_rate": 8.076792794019175e-06,
"loss": 0.3093,
"step": 4551
},
{
"epoch": 2.25015449264615,
"grad_norm": 0.11992671601204175,
"learning_rate": 8.072971737890129e-06,
"loss": 0.2956,
"step": 4552
},
{
"epoch": 2.25064886911383,
"grad_norm": 0.12436062559579227,
"learning_rate": 8.069150973944826e-06,
"loss": 0.3179,
"step": 4553
},
{
"epoch": 2.25114324558151,
"grad_norm": 0.11400341063968487,
"learning_rate": 8.065330502762583e-06,
"loss": 0.2883,
"step": 4554
},
{
"epoch": 2.25114324558151,
"eval_loss": 0.5107741355895996,
"eval_runtime": 100.9168,
"eval_samples_per_second": 300.782,
"eval_steps_per_second": 37.605,
"step": 4554
},
{
"epoch": 2.2516376220491905,
"grad_norm": 0.13126957488359847,
"learning_rate": 8.061510324922672e-06,
"loss": 0.309,
"step": 4555
},
{
"epoch": 2.2521319985168704,
"grad_norm": 0.12438373172978597,
"learning_rate": 8.057690441004331e-06,
"loss": 0.2976,
"step": 4556
},
{
"epoch": 2.2526263749845508,
"grad_norm": 0.13140375563421558,
"learning_rate": 8.053870851586741e-06,
"loss": 0.3033,
"step": 4557
},
{
"epoch": 2.253120751452231,
"grad_norm": 0.11712690928606888,
"learning_rate": 8.050051557249046e-06,
"loss": 0.2807,
"step": 4558
},
{
"epoch": 2.253615127919911,
"grad_norm": 0.12160160906194807,
"learning_rate": 8.046232558570341e-06,
"loss": 0.2934,
"step": 4559
},
{
"epoch": 2.254109504387591,
"grad_norm": 0.1202481761281156,
"learning_rate": 8.042413856129675e-06,
"loss": 0.2926,
"step": 4560
},
{
"epoch": 2.2546038808552713,
"grad_norm": 0.11916436708104705,
"learning_rate": 8.038595450506061e-06,
"loss": 0.2939,
"step": 4561
},
{
"epoch": 2.2550982573229517,
"grad_norm": 0.1227589995643579,
"learning_rate": 8.034777342278459e-06,
"loss": 0.2938,
"step": 4562
},
{
"epoch": 2.2555926337906316,
"grad_norm": 0.11872685178426023,
"learning_rate": 8.030959532025783e-06,
"loss": 0.2958,
"step": 4563
},
{
"epoch": 2.2560870102583115,
"grad_norm": 0.14938647972363722,
"learning_rate": 8.027142020326908e-06,
"loss": 0.2875,
"step": 4564
},
{
"epoch": 2.256581386725992,
"grad_norm": 0.12645054620885357,
"learning_rate": 8.02332480776066e-06,
"loss": 0.2897,
"step": 4565
},
{
"epoch": 2.257075763193672,
"grad_norm": 0.11905124569413957,
"learning_rate": 8.019507894905814e-06,
"loss": 0.2942,
"step": 4566
},
{
"epoch": 2.257570139661352,
"grad_norm": 0.12370755934419957,
"learning_rate": 8.015691282341113e-06,
"loss": 0.2966,
"step": 4567
},
{
"epoch": 2.258064516129032,
"grad_norm": 0.1188840742371148,
"learning_rate": 8.011874970645248e-06,
"loss": 0.3124,
"step": 4568
},
{
"epoch": 2.2585588925967124,
"grad_norm": 0.12448597495202621,
"learning_rate": 8.008058960396858e-06,
"loss": 0.2922,
"step": 4569
},
{
"epoch": 2.2590532690643927,
"grad_norm": 0.12607739930276834,
"learning_rate": 8.004243252174546e-06,
"loss": 0.2887,
"step": 4570
},
{
"epoch": 2.2595476455320727,
"grad_norm": 0.12079869310196416,
"learning_rate": 8.000427846556858e-06,
"loss": 0.2884,
"step": 4571
},
{
"epoch": 2.2600420219997526,
"grad_norm": 0.11612275159874436,
"learning_rate": 7.99661274412231e-06,
"loss": 0.299,
"step": 4572
},
{
"epoch": 2.260536398467433,
"grad_norm": 0.12170034627364891,
"learning_rate": 7.992797945449357e-06,
"loss": 0.3196,
"step": 4573
},
{
"epoch": 2.2610307749351133,
"grad_norm": 0.1279978094386449,
"learning_rate": 7.988983451116418e-06,
"loss": 0.298,
"step": 4574
},
{
"epoch": 2.261525151402793,
"grad_norm": 0.12060915901254786,
"learning_rate": 7.985169261701862e-06,
"loss": 0.282,
"step": 4575
},
{
"epoch": 2.2620195278704736,
"grad_norm": 0.11719944190243944,
"learning_rate": 7.981355377784008e-06,
"loss": 0.3166,
"step": 4576
},
{
"epoch": 2.2625139043381535,
"grad_norm": 0.12192540213619704,
"learning_rate": 7.97754179994113e-06,
"loss": 0.3036,
"step": 4577
},
{
"epoch": 2.263008280805834,
"grad_norm": 0.12420892586950467,
"learning_rate": 7.973728528751465e-06,
"loss": 0.3083,
"step": 4578
},
{
"epoch": 2.2635026572735137,
"grad_norm": 0.1328794486338466,
"learning_rate": 7.969915564793195e-06,
"loss": 0.3098,
"step": 4579
},
{
"epoch": 2.263997033741194,
"grad_norm": 0.1307458181362655,
"learning_rate": 7.966102908644454e-06,
"loss": 0.3116,
"step": 4580
},
{
"epoch": 2.264491410208874,
"grad_norm": 0.11978001725659276,
"learning_rate": 7.962290560883336e-06,
"loss": 0.2859,
"step": 4581
},
{
"epoch": 2.2649857866765544,
"grad_norm": 0.11514818255673599,
"learning_rate": 7.958478522087876e-06,
"loss": 0.2812,
"step": 4582
},
{
"epoch": 2.2654801631442343,
"grad_norm": 0.11857061224238666,
"learning_rate": 7.95466679283608e-06,
"loss": 0.3087,
"step": 4583
},
{
"epoch": 2.2659745396119146,
"grad_norm": 0.12009397635453062,
"learning_rate": 7.950855373705897e-06,
"loss": 0.301,
"step": 4584
},
{
"epoch": 2.2664689160795946,
"grad_norm": 0.12597486100281943,
"learning_rate": 7.947044265275224e-06,
"loss": 0.3093,
"step": 4585
},
{
"epoch": 2.266963292547275,
"grad_norm": 0.11821231401991568,
"learning_rate": 7.94323346812192e-06,
"loss": 0.286,
"step": 4586
},
{
"epoch": 2.267457669014955,
"grad_norm": 0.12829650365428846,
"learning_rate": 7.939422982823792e-06,
"loss": 0.3005,
"step": 4587
},
{
"epoch": 2.267952045482635,
"grad_norm": 0.12028171949722972,
"learning_rate": 7.935612809958602e-06,
"loss": 0.2833,
"step": 4588
},
{
"epoch": 2.268446421950315,
"grad_norm": 0.11412747061147535,
"learning_rate": 7.931802950104063e-06,
"loss": 0.2852,
"step": 4589
},
{
"epoch": 2.2689407984179955,
"grad_norm": 0.12124462805574299,
"learning_rate": 7.927993403837842e-06,
"loss": 0.2977,
"step": 4590
},
{
"epoch": 2.2694351748856754,
"grad_norm": 0.11947410093055352,
"learning_rate": 7.92418417173756e-06,
"loss": 0.3103,
"step": 4591
},
{
"epoch": 2.2699295513533557,
"grad_norm": 0.12448784936109729,
"learning_rate": 7.920375254380783e-06,
"loss": 0.3023,
"step": 4592
},
{
"epoch": 2.2704239278210356,
"grad_norm": 0.12061221146822713,
"learning_rate": 7.916566652345033e-06,
"loss": 0.293,
"step": 4593
},
{
"epoch": 2.270918304288716,
"grad_norm": 0.12203269473745962,
"learning_rate": 7.912758366207793e-06,
"loss": 0.2831,
"step": 4594
},
{
"epoch": 2.271412680756396,
"grad_norm": 0.12199786535757888,
"learning_rate": 7.908950396546487e-06,
"loss": 0.2819,
"step": 4595
},
{
"epoch": 2.2719070572240763,
"grad_norm": 0.12127457539995176,
"learning_rate": 7.905142743938494e-06,
"loss": 0.3068,
"step": 4596
},
{
"epoch": 2.272401433691756,
"grad_norm": 0.12121710760893771,
"learning_rate": 7.901335408961143e-06,
"loss": 0.279,
"step": 4597
},
{
"epoch": 2.2728958101594365,
"grad_norm": 0.12259847562547858,
"learning_rate": 7.897528392191722e-06,
"loss": 0.3003,
"step": 4598
},
{
"epoch": 2.2733901866271164,
"grad_norm": 0.1195598546676293,
"learning_rate": 7.893721694207464e-06,
"loss": 0.2852,
"step": 4599
},
{
"epoch": 2.273884563094797,
"grad_norm": 0.12427845545248832,
"learning_rate": 7.889915315585558e-06,
"loss": 0.3009,
"step": 4600
},
{
"epoch": 2.2743789395624767,
"grad_norm": 0.11708320212428755,
"learning_rate": 7.88610925690314e-06,
"loss": 0.2712,
"step": 4601
},
{
"epoch": 2.274873316030157,
"grad_norm": 0.1157151067649292,
"learning_rate": 7.882303518737299e-06,
"loss": 0.3113,
"step": 4602
},
{
"epoch": 2.275367692497837,
"grad_norm": 0.12361217134240562,
"learning_rate": 7.878498101665079e-06,
"loss": 0.2908,
"step": 4603
},
{
"epoch": 2.2758620689655173,
"grad_norm": 0.12030106747756757,
"learning_rate": 7.874693006263467e-06,
"loss": 0.3008,
"step": 4604
},
{
"epoch": 2.2763564454331973,
"grad_norm": 0.11685667226996317,
"learning_rate": 7.870888233109415e-06,
"loss": 0.31,
"step": 4605
},
{
"epoch": 2.2768508219008776,
"grad_norm": 0.12219398922178404,
"learning_rate": 7.867083782779813e-06,
"loss": 0.2858,
"step": 4606
},
{
"epoch": 2.2773451983685575,
"grad_norm": 0.11586312782593666,
"learning_rate": 7.86327965585151e-06,
"loss": 0.2852,
"step": 4607
},
{
"epoch": 2.277839574836238,
"grad_norm": 0.11960782879528889,
"learning_rate": 7.859475852901298e-06,
"loss": 0.3014,
"step": 4608
},
{
"epoch": 2.278333951303918,
"grad_norm": 0.12814613280686235,
"learning_rate": 7.855672374505924e-06,
"loss": 0.3013,
"step": 4609
},
{
"epoch": 2.278828327771598,
"grad_norm": 0.11691766079758921,
"learning_rate": 7.851869221242097e-06,
"loss": 0.3027,
"step": 4610
},
{
"epoch": 2.279322704239278,
"grad_norm": 0.1184164520221453,
"learning_rate": 7.848066393686457e-06,
"loss": 0.3027,
"step": 4611
},
{
"epoch": 2.2798170807069584,
"grad_norm": 0.12420790730509927,
"learning_rate": 7.844263892415608e-06,
"loss": 0.2963,
"step": 4612
},
{
"epoch": 2.2803114571746383,
"grad_norm": 0.1213226518138984,
"learning_rate": 7.840461718006098e-06,
"loss": 0.3155,
"step": 4613
},
{
"epoch": 2.2808058336423187,
"grad_norm": 0.12017241444317105,
"learning_rate": 7.83665987103443e-06,
"loss": 0.2945,
"step": 4614
},
{
"epoch": 2.2813002101099986,
"grad_norm": 0.12612615601947796,
"learning_rate": 7.83285835207705e-06,
"loss": 0.3082,
"step": 4615
},
{
"epoch": 2.281794586577679,
"grad_norm": 0.12041902336684174,
"learning_rate": 7.829057161710367e-06,
"loss": 0.3103,
"step": 4616
},
{
"epoch": 2.282288963045359,
"grad_norm": 0.1302578268433436,
"learning_rate": 7.825256300510731e-06,
"loss": 0.3055,
"step": 4617
},
{
"epoch": 2.2827833395130392,
"grad_norm": 0.12165315273724221,
"learning_rate": 7.82145576905444e-06,
"loss": 0.2764,
"step": 4618
},
{
"epoch": 2.283277715980719,
"grad_norm": 0.11317348174516419,
"learning_rate": 7.817655567917747e-06,
"loss": 0.2962,
"step": 4619
},
{
"epoch": 2.2837720924483995,
"grad_norm": 0.118058888415336,
"learning_rate": 7.813855697676856e-06,
"loss": 0.2933,
"step": 4620
},
{
"epoch": 2.2842664689160794,
"grad_norm": 0.11467395819547754,
"learning_rate": 7.810056158907916e-06,
"loss": 0.2974,
"step": 4621
},
{
"epoch": 2.28476084538376,
"grad_norm": 0.5436915556150429,
"learning_rate": 7.80625695218703e-06,
"loss": 0.3195,
"step": 4622
},
{
"epoch": 2.2852552218514397,
"grad_norm": 0.11540051516643092,
"learning_rate": 7.80245807809025e-06,
"loss": 0.2911,
"step": 4623
},
{
"epoch": 2.28574959831912,
"grad_norm": 0.13114014169066285,
"learning_rate": 7.798659537193577e-06,
"loss": 0.3106,
"step": 4624
},
{
"epoch": 2.2862439747868,
"grad_norm": 0.11890883256436731,
"learning_rate": 7.794861330072956e-06,
"loss": 0.278,
"step": 4625
},
{
"epoch": 2.2867383512544803,
"grad_norm": 0.11911937558334347,
"learning_rate": 7.791063457304287e-06,
"loss": 0.3153,
"step": 4626
},
{
"epoch": 2.2872327277221602,
"grad_norm": 0.13369424661869225,
"learning_rate": 7.787265919463424e-06,
"loss": 0.3168,
"step": 4627
},
{
"epoch": 2.2877271041898406,
"grad_norm": 0.12142999057154685,
"learning_rate": 7.783468717126162e-06,
"loss": 0.3004,
"step": 4628
},
{
"epoch": 2.2882214806575205,
"grad_norm": 0.12282606040109735,
"learning_rate": 7.779671850868248e-06,
"loss": 0.2909,
"step": 4629
},
{
"epoch": 2.288715857125201,
"grad_norm": 0.12737703395659902,
"learning_rate": 7.775875321265376e-06,
"loss": 0.2993,
"step": 4630
},
{
"epoch": 2.289210233592881,
"grad_norm": 0.1256125821579249,
"learning_rate": 7.772079128893192e-06,
"loss": 0.3091,
"step": 4631
},
{
"epoch": 2.289704610060561,
"grad_norm": 0.12083869633339261,
"learning_rate": 7.768283274327295e-06,
"loss": 0.301,
"step": 4632
},
{
"epoch": 2.2901989865282415,
"grad_norm": 0.13236045318276832,
"learning_rate": 7.764487758143224e-06,
"loss": 0.2783,
"step": 4633
},
{
"epoch": 2.2906933629959214,
"grad_norm": 0.12094544961447191,
"learning_rate": 7.760692580916467e-06,
"loss": 0.3007,
"step": 4634
},
{
"epoch": 2.2911877394636013,
"grad_norm": 0.13183167694384254,
"learning_rate": 7.756897743222468e-06,
"loss": 0.2855,
"step": 4635
},
{
"epoch": 2.2916821159312817,
"grad_norm": 0.12570443949288132,
"learning_rate": 7.753103245636614e-06,
"loss": 0.3044,
"step": 4636
},
{
"epoch": 2.292176492398962,
"grad_norm": 0.12371842844952495,
"learning_rate": 7.74930908873424e-06,
"loss": 0.2865,
"step": 4637
},
{
"epoch": 2.292670868866642,
"grad_norm": 0.1254616713714016,
"learning_rate": 7.745515273090636e-06,
"loss": 0.3013,
"step": 4638
},
{
"epoch": 2.293165245334322,
"grad_norm": 0.12495904560363481,
"learning_rate": 7.741721799281033e-06,
"loss": 0.2816,
"step": 4639
},
{
"epoch": 2.2936596218020022,
"grad_norm": 0.12246907752643052,
"learning_rate": 7.737928667880616e-06,
"loss": 0.2971,
"step": 4640
},
{
"epoch": 2.2941539982696826,
"grad_norm": 0.14872701740212083,
"learning_rate": 7.734135879464507e-06,
"loss": 0.307,
"step": 4641
},
{
"epoch": 2.2946483747373625,
"grad_norm": 0.11817303108566232,
"learning_rate": 7.730343434607786e-06,
"loss": 0.3135,
"step": 4642
},
{
"epoch": 2.2951427512050424,
"grad_norm": 0.11930990284782977,
"learning_rate": 7.726551333885486e-06,
"loss": 0.2996,
"step": 4643
},
{
"epoch": 2.2956371276727228,
"grad_norm": 0.13910595448111912,
"learning_rate": 7.722759577872575e-06,
"loss": 0.3062,
"step": 4644
},
{
"epoch": 2.296131504140403,
"grad_norm": 0.1186037528011157,
"learning_rate": 7.718968167143972e-06,
"loss": 0.3004,
"step": 4645
},
{
"epoch": 2.296625880608083,
"grad_norm": 0.11782031361211384,
"learning_rate": 7.71517710227455e-06,
"loss": 0.3224,
"step": 4646
},
{
"epoch": 2.297120257075763,
"grad_norm": 0.13094870554701324,
"learning_rate": 7.711386383839127e-06,
"loss": 0.2746,
"step": 4647
},
{
"epoch": 2.2976146335434433,
"grad_norm": 0.11826282082668042,
"learning_rate": 7.707596012412458e-06,
"loss": 0.2796,
"step": 4648
},
{
"epoch": 2.2981090100111237,
"grad_norm": 0.12130780775485583,
"learning_rate": 7.703805988569262e-06,
"loss": 0.2788,
"step": 4649
},
{
"epoch": 2.2986033864788036,
"grad_norm": 0.12170404706002566,
"learning_rate": 7.7000163128842e-06,
"loss": 0.3174,
"step": 4650
},
{
"epoch": 2.299097762946484,
"grad_norm": 0.12281473395920219,
"learning_rate": 7.69622698593187e-06,
"loss": 0.3036,
"step": 4651
},
{
"epoch": 2.299592139414164,
"grad_norm": 0.12468825271262823,
"learning_rate": 7.692438008286828e-06,
"loss": 0.2983,
"step": 4652
},
{
"epoch": 2.300086515881844,
"grad_norm": 0.12122766257127829,
"learning_rate": 7.688649380523573e-06,
"loss": 0.2844,
"step": 4653
},
{
"epoch": 2.300580892349524,
"grad_norm": 0.11956659677220018,
"learning_rate": 7.684861103216558e-06,
"loss": 0.3067,
"step": 4654
},
{
"epoch": 2.3010752688172045,
"grad_norm": 0.12344164087829942,
"learning_rate": 7.681073176940171e-06,
"loss": 0.3005,
"step": 4655
},
{
"epoch": 2.3015696452848844,
"grad_norm": 0.12391035051459394,
"learning_rate": 7.677285602268751e-06,
"loss": 0.3004,
"step": 4656
},
{
"epoch": 2.3020640217525647,
"grad_norm": 0.11678173594504518,
"learning_rate": 7.673498379776593e-06,
"loss": 0.3131,
"step": 4657
},
{
"epoch": 2.3025583982202447,
"grad_norm": 0.12284502944447048,
"learning_rate": 7.669711510037923e-06,
"loss": 0.3018,
"step": 4658
},
{
"epoch": 2.303052774687925,
"grad_norm": 0.1168644400911564,
"learning_rate": 7.665924993626921e-06,
"loss": 0.3008,
"step": 4659
},
{
"epoch": 2.303547151155605,
"grad_norm": 0.1267647879624106,
"learning_rate": 7.66213883111772e-06,
"loss": 0.2981,
"step": 4660
},
{
"epoch": 2.3040415276232853,
"grad_norm": 0.12148733407406805,
"learning_rate": 7.658353023084388e-06,
"loss": 0.2989,
"step": 4661
},
{
"epoch": 2.304535904090965,
"grad_norm": 0.12557600334127192,
"learning_rate": 7.654567570100949e-06,
"loss": 0.3231,
"step": 4662
},
{
"epoch": 2.3050302805586456,
"grad_norm": 0.1192874552150874,
"learning_rate": 7.650782472741367e-06,
"loss": 0.297,
"step": 4663
},
{
"epoch": 2.3055246570263255,
"grad_norm": 0.12588725798242217,
"learning_rate": 7.646997731579546e-06,
"loss": 0.2915,
"step": 4664
},
{
"epoch": 2.306019033494006,
"grad_norm": 0.11446527575825721,
"learning_rate": 7.643213347189356e-06,
"loss": 0.2899,
"step": 4665
},
{
"epoch": 2.3065134099616857,
"grad_norm": 0.11780200477116404,
"learning_rate": 7.639429320144594e-06,
"loss": 0.3035,
"step": 4666
},
{
"epoch": 2.307007786429366,
"grad_norm": 0.12035970380339688,
"learning_rate": 7.63564565101901e-06,
"loss": 0.2784,
"step": 4667
},
{
"epoch": 2.307502162897046,
"grad_norm": 0.12040803460126887,
"learning_rate": 7.631862340386299e-06,
"loss": 0.3106,
"step": 4668
},
{
"epoch": 2.3079965393647264,
"grad_norm": 0.1225434349838537,
"learning_rate": 7.628079388820099e-06,
"loss": 0.3018,
"step": 4669
},
{
"epoch": 2.3084909158324063,
"grad_norm": 0.12721666317911948,
"learning_rate": 7.624296796894001e-06,
"loss": 0.3174,
"step": 4670
},
{
"epoch": 2.3089852923000866,
"grad_norm": 0.12295681448309692,
"learning_rate": 7.620514565181535e-06,
"loss": 0.2988,
"step": 4671
},
{
"epoch": 2.3094796687677666,
"grad_norm": 0.12139923558472504,
"learning_rate": 7.616732694256178e-06,
"loss": 0.3038,
"step": 4672
},
{
"epoch": 2.309974045235447,
"grad_norm": 0.12362224439682595,
"learning_rate": 7.612951184691355e-06,
"loss": 0.3098,
"step": 4673
},
{
"epoch": 2.310468421703127,
"grad_norm": 0.13526004768874053,
"learning_rate": 7.609170037060427e-06,
"loss": 0.2961,
"step": 4674
},
{
"epoch": 2.310962798170807,
"grad_norm": 0.12325813875273786,
"learning_rate": 7.60538925193671e-06,
"loss": 0.303,
"step": 4675
},
{
"epoch": 2.311457174638487,
"grad_norm": 0.11903322544937432,
"learning_rate": 7.601608829893465e-06,
"loss": 0.3023,
"step": 4676
},
{
"epoch": 2.3119515511061675,
"grad_norm": 0.11651881459729674,
"learning_rate": 7.597828771503891e-06,
"loss": 0.2926,
"step": 4677
},
{
"epoch": 2.3124459275738474,
"grad_norm": 0.11788718285358252,
"learning_rate": 7.594049077341137e-06,
"loss": 0.2886,
"step": 4678
},
{
"epoch": 2.3129403040415277,
"grad_norm": 0.1272691407743114,
"learning_rate": 7.590269747978296e-06,
"loss": 0.3066,
"step": 4679
},
{
"epoch": 2.3134346805092076,
"grad_norm": 0.11861588361192059,
"learning_rate": 7.5864907839884005e-06,
"loss": 0.3154,
"step": 4680
},
{
"epoch": 2.313929056976888,
"grad_norm": 0.11947193509061278,
"learning_rate": 7.58271218594444e-06,
"loss": 0.2922,
"step": 4681
},
{
"epoch": 2.314423433444568,
"grad_norm": 0.11714591320706924,
"learning_rate": 7.578933954419336e-06,
"loss": 0.3104,
"step": 4682
},
{
"epoch": 2.3149178099122483,
"grad_norm": 0.12226202179211783,
"learning_rate": 7.57515608998596e-06,
"loss": 0.2843,
"step": 4683
},
{
"epoch": 2.315412186379928,
"grad_norm": 0.1161780345742778,
"learning_rate": 7.571378593217125e-06,
"loss": 0.2973,
"step": 4684
},
{
"epoch": 2.3159065628476085,
"grad_norm": 0.12279493427887442,
"learning_rate": 7.567601464685592e-06,
"loss": 0.3025,
"step": 4685
},
{
"epoch": 2.3164009393152885,
"grad_norm": 0.11640259379555197,
"learning_rate": 7.5638247049640626e-06,
"loss": 0.279,
"step": 4686
},
{
"epoch": 2.316895315782969,
"grad_norm": 0.11900186036993972,
"learning_rate": 7.560048314625187e-06,
"loss": 0.3022,
"step": 4687
},
{
"epoch": 2.3173896922506487,
"grad_norm": 0.12214037403930476,
"learning_rate": 7.556272294241556e-06,
"loss": 0.2904,
"step": 4688
},
{
"epoch": 2.317884068718329,
"grad_norm": 0.11511320070486425,
"learning_rate": 7.552496644385705e-06,
"loss": 0.3112,
"step": 4689
},
{
"epoch": 2.318378445186009,
"grad_norm": 0.1265179555165063,
"learning_rate": 7.548721365630112e-06,
"loss": 0.2879,
"step": 4690
},
{
"epoch": 2.3188728216536894,
"grad_norm": 0.1159105204766444,
"learning_rate": 7.544946458547195e-06,
"loss": 0.2931,
"step": 4691
},
{
"epoch": 2.3193671981213693,
"grad_norm": 0.11656083922497903,
"learning_rate": 7.5411719237093314e-06,
"loss": 0.3034,
"step": 4692
},
{
"epoch": 2.3198615745890496,
"grad_norm": 0.11873911076801083,
"learning_rate": 7.537397761688825e-06,
"loss": 0.309,
"step": 4693
},
{
"epoch": 2.3203559510567295,
"grad_norm": 0.12011100265665021,
"learning_rate": 7.53362397305793e-06,
"loss": 0.2856,
"step": 4694
},
{
"epoch": 2.32085032752441,
"grad_norm": 0.12271408632579314,
"learning_rate": 7.5298505583888424e-06,
"loss": 0.3121,
"step": 4695
},
{
"epoch": 2.32134470399209,
"grad_norm": 0.12492713249943851,
"learning_rate": 7.526077518253706e-06,
"loss": 0.3026,
"step": 4696
},
{
"epoch": 2.32183908045977,
"grad_norm": 0.12659855574987808,
"learning_rate": 7.5223048532245955e-06,
"loss": 0.3138,
"step": 4697
},
{
"epoch": 2.32233345692745,
"grad_norm": 0.12183830096076351,
"learning_rate": 7.518532563873548e-06,
"loss": 0.3048,
"step": 4698
},
{
"epoch": 2.3228278333951304,
"grad_norm": 0.11816033971289926,
"learning_rate": 7.51476065077253e-06,
"loss": 0.3086,
"step": 4699
},
{
"epoch": 2.3233222098628104,
"grad_norm": 0.12860901512336162,
"learning_rate": 7.5109891144934525e-06,
"loss": 0.2906,
"step": 4700
},
{
"epoch": 2.3238165863304907,
"grad_norm": 0.12326669875557918,
"learning_rate": 7.5072179556081696e-06,
"loss": 0.303,
"step": 4701
},
{
"epoch": 2.3243109627981706,
"grad_norm": 0.12121579842390912,
"learning_rate": 7.503447174688479e-06,
"loss": 0.2931,
"step": 4702
},
{
"epoch": 2.324805339265851,
"grad_norm": 0.11549941656019759,
"learning_rate": 7.499676772306126e-06,
"loss": 0.2942,
"step": 4703
},
{
"epoch": 2.325299715733531,
"grad_norm": 0.11659292299420765,
"learning_rate": 7.495906749032793e-06,
"loss": 0.3041,
"step": 4704
},
{
"epoch": 2.3257940922012112,
"grad_norm": 0.11685434881075145,
"learning_rate": 7.492137105440104e-06,
"loss": 0.2934,
"step": 4705
},
{
"epoch": 2.326288468668891,
"grad_norm": 0.12377739764460761,
"learning_rate": 7.488367842099631e-06,
"loss": 0.3117,
"step": 4706
},
{
"epoch": 2.3267828451365715,
"grad_norm": 0.11877765977675996,
"learning_rate": 7.484598959582879e-06,
"loss": 0.2879,
"step": 4707
},
{
"epoch": 2.327277221604252,
"grad_norm": 0.12352202842689916,
"learning_rate": 7.480830458461303e-06,
"loss": 0.2968,
"step": 4708
},
{
"epoch": 2.327771598071932,
"grad_norm": 0.12837104368100866,
"learning_rate": 7.477062339306301e-06,
"loss": 0.2908,
"step": 4709
},
{
"epoch": 2.3282659745396117,
"grad_norm": 0.12370890160544662,
"learning_rate": 7.473294602689209e-06,
"loss": 0.2974,
"step": 4710
},
{
"epoch": 2.328760351007292,
"grad_norm": 0.12087202138226298,
"learning_rate": 7.469527249181307e-06,
"loss": 0.2845,
"step": 4711
},
{
"epoch": 2.3292547274749724,
"grad_norm": 0.11579416619975937,
"learning_rate": 7.4657602793538135e-06,
"loss": 0.2831,
"step": 4712
},
{
"epoch": 2.3297491039426523,
"grad_norm": 0.12863649625423984,
"learning_rate": 7.461993693777893e-06,
"loss": 0.2981,
"step": 4713
},
{
"epoch": 2.3302434804103322,
"grad_norm": 0.11789456445368596,
"learning_rate": 7.458227493024651e-06,
"loss": 0.3034,
"step": 4714
},
{
"epoch": 2.3307378568780126,
"grad_norm": 0.11764939475248865,
"learning_rate": 7.454461677665137e-06,
"loss": 0.291,
"step": 4715
},
{
"epoch": 2.331232233345693,
"grad_norm": 0.11753663988204016,
"learning_rate": 7.450696248270333e-06,
"loss": 0.3176,
"step": 4716
},
{
"epoch": 2.331726609813373,
"grad_norm": 0.12194678899715057,
"learning_rate": 7.4469312054111695e-06,
"loss": 0.3102,
"step": 4717
},
{
"epoch": 2.332220986281053,
"grad_norm": 0.13186597881980125,
"learning_rate": 7.443166549658521e-06,
"loss": 0.3405,
"step": 4718
},
{
"epoch": 2.332715362748733,
"grad_norm": 0.12113369540353373,
"learning_rate": 7.4394022815831945e-06,
"loss": 0.296,
"step": 4719
},
{
"epoch": 2.3332097392164135,
"grad_norm": 0.12315896436119153,
"learning_rate": 7.435638401755949e-06,
"loss": 0.3146,
"step": 4720
},
{
"epoch": 2.3337041156840934,
"grad_norm": 0.1185923883636943,
"learning_rate": 7.4318749107474776e-06,
"loss": 0.298,
"step": 4721
},
{
"epoch": 2.3341984921517733,
"grad_norm": 0.12028640331141607,
"learning_rate": 7.428111809128415e-06,
"loss": 0.2669,
"step": 4722
},
{
"epoch": 2.3346928686194537,
"grad_norm": 0.12247631750600893,
"learning_rate": 7.424349097469337e-06,
"loss": 0.3019,
"step": 4723
},
{
"epoch": 2.335187245087134,
"grad_norm": 0.12441303614495439,
"learning_rate": 7.420586776340757e-06,
"loss": 0.3016,
"step": 4724
},
{
"epoch": 2.335681621554814,
"grad_norm": 0.12305918067033214,
"learning_rate": 7.416824846313142e-06,
"loss": 0.2819,
"step": 4725
},
{
"epoch": 2.3361759980224943,
"grad_norm": 0.11571599562948127,
"learning_rate": 7.413063307956887e-06,
"loss": 0.3078,
"step": 4726
},
{
"epoch": 2.3366703744901742,
"grad_norm": 0.11978317858574433,
"learning_rate": 7.40930216184233e-06,
"loss": 0.3048,
"step": 4727
},
{
"epoch": 2.3371647509578546,
"grad_norm": 0.12088185483248909,
"learning_rate": 7.405541408539752e-06,
"loss": 0.2947,
"step": 4728
},
{
"epoch": 2.3376591274255345,
"grad_norm": 0.11870238593973907,
"learning_rate": 7.401781048619377e-06,
"loss": 0.2913,
"step": 4729
},
{
"epoch": 2.338153503893215,
"grad_norm": 0.12045737122927673,
"learning_rate": 7.398021082651354e-06,
"loss": 0.3136,
"step": 4730
},
{
"epoch": 2.3386478803608948,
"grad_norm": 0.11982482887450356,
"learning_rate": 7.394261511205798e-06,
"loss": 0.2926,
"step": 4731
},
{
"epoch": 2.339142256828575,
"grad_norm": 0.1188579994372803,
"learning_rate": 7.390502334852747e-06,
"loss": 0.3007,
"step": 4732
},
{
"epoch": 2.339636633296255,
"grad_norm": 0.1188286802088766,
"learning_rate": 7.386743554162179e-06,
"loss": 0.3068,
"step": 4733
},
{
"epoch": 2.3401310097639354,
"grad_norm": 0.12199544352782359,
"learning_rate": 7.382985169704016e-06,
"loss": 0.3013,
"step": 4734
},
{
"epoch": 2.3406253862316153,
"grad_norm": 0.12264625236158591,
"learning_rate": 7.379227182048117e-06,
"loss": 0.2808,
"step": 4735
},
{
"epoch": 2.3411197626992957,
"grad_norm": 0.12517329644024042,
"learning_rate": 7.375469591764288e-06,
"loss": 0.3033,
"step": 4736
},
{
"epoch": 2.3416141391669756,
"grad_norm": 0.12042139499687247,
"learning_rate": 7.371712399422269e-06,
"loss": 0.3063,
"step": 4737
},
{
"epoch": 2.342108515634656,
"grad_norm": 0.11993413958607242,
"learning_rate": 7.367955605591739e-06,
"loss": 0.3127,
"step": 4738
},
{
"epoch": 2.342602892102336,
"grad_norm": 0.25676169123521686,
"learning_rate": 7.36419921084232e-06,
"loss": 0.3026,
"step": 4739
},
{
"epoch": 2.343097268570016,
"grad_norm": 0.1245327412981073,
"learning_rate": 7.360443215743565e-06,
"loss": 0.3376,
"step": 4740
},
{
"epoch": 2.343591645037696,
"grad_norm": 0.11941990102840876,
"learning_rate": 7.356687620864984e-06,
"loss": 0.301,
"step": 4741
},
{
"epoch": 2.3440860215053765,
"grad_norm": 0.12733208950216002,
"learning_rate": 7.352932426776008e-06,
"loss": 0.3114,
"step": 4742
},
{
"epoch": 2.3445803979730564,
"grad_norm": 0.12676690040401747,
"learning_rate": 7.349177634046014e-06,
"loss": 0.3071,
"step": 4743
},
{
"epoch": 2.3450747744407368,
"grad_norm": 0.13382205431455033,
"learning_rate": 7.345423243244323e-06,
"loss": 0.3119,
"step": 4744
},
{
"epoch": 2.3455691509084167,
"grad_norm": 0.12178249864470311,
"learning_rate": 7.3416692549401905e-06,
"loss": 0.2718,
"step": 4745
},
{
"epoch": 2.346063527376097,
"grad_norm": 0.12423489141065461,
"learning_rate": 7.337915669702802e-06,
"loss": 0.3101,
"step": 4746
},
{
"epoch": 2.346557903843777,
"grad_norm": 0.12730771200147606,
"learning_rate": 7.334162488101303e-06,
"loss": 0.313,
"step": 4747
},
{
"epoch": 2.3470522803114573,
"grad_norm": 0.12516945541416785,
"learning_rate": 7.330409710704764e-06,
"loss": 0.3024,
"step": 4748
},
{
"epoch": 2.347546656779137,
"grad_norm": 0.11874611568052662,
"learning_rate": 7.326657338082191e-06,
"loss": 0.3,
"step": 4749
},
{
"epoch": 2.3480410332468176,
"grad_norm": 0.12400915870529168,
"learning_rate": 7.322905370802535e-06,
"loss": 0.2905,
"step": 4750
},
{
"epoch": 2.3485354097144975,
"grad_norm": 0.11843651289074053,
"learning_rate": 7.319153809434684e-06,
"loss": 0.2999,
"step": 4751
},
{
"epoch": 2.349029786182178,
"grad_norm": 0.11867924125068892,
"learning_rate": 7.3154026545474696e-06,
"loss": 0.2863,
"step": 4752
},
{
"epoch": 2.3495241626498578,
"grad_norm": 0.12100714307471798,
"learning_rate": 7.311651906709654e-06,
"loss": 0.3046,
"step": 4753
},
{
"epoch": 2.350018539117538,
"grad_norm": 0.11610879080762856,
"learning_rate": 7.307901566489939e-06,
"loss": 0.3086,
"step": 4754
},
{
"epoch": 2.350512915585218,
"grad_norm": 0.12961418717075093,
"learning_rate": 7.30415163445697e-06,
"loss": 0.2991,
"step": 4755
},
{
"epoch": 2.3510072920528984,
"grad_norm": 0.12238848896578085,
"learning_rate": 7.300402111179321e-06,
"loss": 0.3097,
"step": 4756
},
{
"epoch": 2.3515016685205783,
"grad_norm": 0.13263653593877867,
"learning_rate": 7.296652997225512e-06,
"loss": 0.2969,
"step": 4757
},
{
"epoch": 2.3519960449882586,
"grad_norm": 0.12250867191010659,
"learning_rate": 7.292904293164e-06,
"loss": 0.3001,
"step": 4758
},
{
"epoch": 2.3524904214559386,
"grad_norm": 0.11634935727144137,
"learning_rate": 7.28915599956318e-06,
"loss": 0.3073,
"step": 4759
},
{
"epoch": 2.352984797923619,
"grad_norm": 0.11869437773625423,
"learning_rate": 7.285408116991382e-06,
"loss": 0.2875,
"step": 4760
},
{
"epoch": 2.353479174391299,
"grad_norm": 0.12028600981029014,
"learning_rate": 7.281660646016873e-06,
"loss": 0.3099,
"step": 4761
},
{
"epoch": 2.353973550858979,
"grad_norm": 0.1278856518842002,
"learning_rate": 7.277913587207857e-06,
"loss": 0.2851,
"step": 4762
},
{
"epoch": 2.354467927326659,
"grad_norm": 0.12118041950268427,
"learning_rate": 7.274166941132485e-06,
"loss": 0.3262,
"step": 4763
},
{
"epoch": 2.3549623037943395,
"grad_norm": 0.12760639148373262,
"learning_rate": 7.27042070835884e-06,
"loss": 0.3094,
"step": 4764
},
{
"epoch": 2.3554566802620194,
"grad_norm": 0.12158872658283057,
"learning_rate": 7.266674889454932e-06,
"loss": 0.2764,
"step": 4765
},
{
"epoch": 2.3559510567296997,
"grad_norm": 0.12001601912372939,
"learning_rate": 7.262929484988721e-06,
"loss": 0.2931,
"step": 4766
},
{
"epoch": 2.3564454331973796,
"grad_norm": 0.11950045772630855,
"learning_rate": 7.259184495528102e-06,
"loss": 0.2959,
"step": 4767
},
{
"epoch": 2.35693980966506,
"grad_norm": 0.12206981257049314,
"learning_rate": 7.255439921640901e-06,
"loss": 0.2916,
"step": 4768
},
{
"epoch": 2.35743418613274,
"grad_norm": 0.12271455580847608,
"learning_rate": 7.251695763894889e-06,
"loss": 0.2969,
"step": 4769
},
{
"epoch": 2.3579285626004203,
"grad_norm": 0.11353809913998802,
"learning_rate": 7.2479520228577705e-06,
"loss": 0.2983,
"step": 4770
},
{
"epoch": 2.3584229390681,
"grad_norm": 0.12011258210432371,
"learning_rate": 7.244208699097187e-06,
"loss": 0.3094,
"step": 4771
},
{
"epoch": 2.3589173155357805,
"grad_norm": 0.12283207969675548,
"learning_rate": 7.240465793180713e-06,
"loss": 0.3328,
"step": 4772
},
{
"epoch": 2.3594116920034605,
"grad_norm": 0.12289404502835787,
"learning_rate": 7.236723305675859e-06,
"loss": 0.2761,
"step": 4773
},
{
"epoch": 2.359906068471141,
"grad_norm": 0.11144523507874941,
"learning_rate": 7.232981237150089e-06,
"loss": 0.2887,
"step": 4774
},
{
"epoch": 2.3604004449388207,
"grad_norm": 0.11450378021683043,
"learning_rate": 7.22923958817078e-06,
"loss": 0.3025,
"step": 4775
},
{
"epoch": 2.360894821406501,
"grad_norm": 0.12169781401253474,
"learning_rate": 7.225498359305257e-06,
"loss": 0.2959,
"step": 4776
},
{
"epoch": 2.361389197874181,
"grad_norm": 0.11367705526783699,
"learning_rate": 7.221757551120783e-06,
"loss": 0.2846,
"step": 4777
},
{
"epoch": 2.3618835743418614,
"grad_norm": 0.1203446438161089,
"learning_rate": 7.218017164184557e-06,
"loss": 0.3008,
"step": 4778
},
{
"epoch": 2.3623779508095413,
"grad_norm": 0.11258243108922315,
"learning_rate": 7.214277199063697e-06,
"loss": 0.2779,
"step": 4779
},
{
"epoch": 2.3628723272772216,
"grad_norm": 0.12040949361639479,
"learning_rate": 7.2105376563252895e-06,
"loss": 0.2936,
"step": 4780
},
{
"epoch": 2.3633667037449015,
"grad_norm": 0.1287428986082287,
"learning_rate": 7.206798536536333e-06,
"loss": 0.3039,
"step": 4781
},
{
"epoch": 2.363861080212582,
"grad_norm": 0.11901600246228497,
"learning_rate": 7.2030598402637615e-06,
"loss": 0.3053,
"step": 4782
},
{
"epoch": 2.3643554566802623,
"grad_norm": 0.12505035808877615,
"learning_rate": 7.199321568074458e-06,
"loss": 0.2925,
"step": 4783
},
{
"epoch": 2.364849833147942,
"grad_norm": 0.114244152654461,
"learning_rate": 7.1955837205352295e-06,
"loss": 0.3056,
"step": 4784
},
{
"epoch": 2.365344209615622,
"grad_norm": 0.12312246887426194,
"learning_rate": 7.1918462982128275e-06,
"loss": 0.2839,
"step": 4785
},
{
"epoch": 2.3658385860833024,
"grad_norm": 0.11931451048333745,
"learning_rate": 7.188109301673935e-06,
"loss": 0.3118,
"step": 4786
},
{
"epoch": 2.366332962550983,
"grad_norm": 0.11956301564728095,
"learning_rate": 7.184372731485167e-06,
"loss": 0.2891,
"step": 4787
},
{
"epoch": 2.3668273390186627,
"grad_norm": 0.2847468033832284,
"learning_rate": 7.180636588213083e-06,
"loss": 0.2996,
"step": 4788
},
{
"epoch": 2.3673217154863426,
"grad_norm": 0.2072083412176583,
"learning_rate": 7.176900872424164e-06,
"loss": 0.2997,
"step": 4789
},
{
"epoch": 2.367816091954023,
"grad_norm": 0.13792271515829588,
"learning_rate": 7.173165584684836e-06,
"loss": 0.3287,
"step": 4790
},
{
"epoch": 2.3683104684217033,
"grad_norm": 0.11975981054923604,
"learning_rate": 7.169430725561463e-06,
"loss": 0.2914,
"step": 4791
},
{
"epoch": 2.3688048448893833,
"grad_norm": 0.12329665505725139,
"learning_rate": 7.165696295620338e-06,
"loss": 0.2871,
"step": 4792
},
{
"epoch": 2.369299221357063,
"grad_norm": 0.12299005597826669,
"learning_rate": 7.161962295427688e-06,
"loss": 0.3017,
"step": 4793
},
{
"epoch": 2.3697935978247435,
"grad_norm": 0.1185122319215121,
"learning_rate": 7.158228725549679e-06,
"loss": 0.3123,
"step": 4794
},
{
"epoch": 2.370287974292424,
"grad_norm": 0.12409992561930855,
"learning_rate": 7.154495586552405e-06,
"loss": 0.328,
"step": 4795
},
{
"epoch": 2.370782350760104,
"grad_norm": 0.127088646241909,
"learning_rate": 7.150762879001906e-06,
"loss": 0.2903,
"step": 4796
},
{
"epoch": 2.3712767272277837,
"grad_norm": 0.11832821393420226,
"learning_rate": 7.147030603464149e-06,
"loss": 0.2716,
"step": 4797
},
{
"epoch": 2.371771103695464,
"grad_norm": 0.4227355098453066,
"learning_rate": 7.1432987605050345e-06,
"loss": 0.3623,
"step": 4798
},
{
"epoch": 2.3722654801631444,
"grad_norm": 0.1341203160904066,
"learning_rate": 7.1395673506903985e-06,
"loss": 0.3365,
"step": 4799
},
{
"epoch": 2.3727598566308243,
"grad_norm": 0.1281908704886917,
"learning_rate": 7.135836374586013e-06,
"loss": 0.3024,
"step": 4800
},
{
"epoch": 2.3732542330985047,
"grad_norm": 0.11917054744865076,
"learning_rate": 7.132105832757585e-06,
"loss": 0.2913,
"step": 4801
},
{
"epoch": 2.3737486095661846,
"grad_norm": 0.12652930023576467,
"learning_rate": 7.128375725770753e-06,
"loss": 0.3024,
"step": 4802
},
{
"epoch": 2.374242986033865,
"grad_norm": 0.1167665940273665,
"learning_rate": 7.124646054191093e-06,
"loss": 0.29,
"step": 4803
},
{
"epoch": 2.374737362501545,
"grad_norm": 0.12215588973826659,
"learning_rate": 7.120916818584112e-06,
"loss": 0.2949,
"step": 4804
},
{
"epoch": 2.3752317389692252,
"grad_norm": 0.11932507833896012,
"learning_rate": 7.1171880195152485e-06,
"loss": 0.3032,
"step": 4805
},
{
"epoch": 2.375726115436905,
"grad_norm": 0.1262010350195653,
"learning_rate": 7.113459657549876e-06,
"loss": 0.3031,
"step": 4806
},
{
"epoch": 2.3762204919045855,
"grad_norm": 0.12477811047402412,
"learning_rate": 7.109731733253313e-06,
"loss": 0.3007,
"step": 4807
},
{
"epoch": 2.3767148683722654,
"grad_norm": 0.11871004680304072,
"learning_rate": 7.106004247190797e-06,
"loss": 0.2847,
"step": 4808
},
{
"epoch": 2.377209244839946,
"grad_norm": 0.11732643081325171,
"learning_rate": 7.102277199927503e-06,
"loss": 0.3022,
"step": 4809
},
{
"epoch": 2.3777036213076257,
"grad_norm": 0.4526492935581887,
"learning_rate": 7.098550592028542e-06,
"loss": 0.3399,
"step": 4810
},
{
"epoch": 2.378197997775306,
"grad_norm": 0.11825405989194657,
"learning_rate": 7.09482442405896e-06,
"loss": 0.2881,
"step": 4811
},
{
"epoch": 2.378692374242986,
"grad_norm": 0.12985637245341378,
"learning_rate": 7.091098696583724e-06,
"loss": 0.3064,
"step": 4812
},
{
"epoch": 2.3791867507106663,
"grad_norm": 0.12410966363819313,
"learning_rate": 7.087373410167757e-06,
"loss": 0.3,
"step": 4813
},
{
"epoch": 2.3796811271783462,
"grad_norm": 0.12018152697016592,
"learning_rate": 7.0836485653758956e-06,
"loss": 0.285,
"step": 4814
},
{
"epoch": 2.3801755036460266,
"grad_norm": 0.12351879460943509,
"learning_rate": 7.079924162772913e-06,
"loss": 0.3006,
"step": 4815
},
{
"epoch": 2.3806698801137065,
"grad_norm": 0.12502253952052478,
"learning_rate": 7.076200202923522e-06,
"loss": 0.3442,
"step": 4816
},
{
"epoch": 2.381164256581387,
"grad_norm": 0.12679811392923873,
"learning_rate": 7.07247668639236e-06,
"loss": 0.3067,
"step": 4817
},
{
"epoch": 2.3816586330490668,
"grad_norm": 0.13849688494177845,
"learning_rate": 7.068753613744006e-06,
"loss": 0.319,
"step": 4818
},
{
"epoch": 2.382153009516747,
"grad_norm": 0.1204613650046403,
"learning_rate": 7.065030985542967e-06,
"loss": 0.2861,
"step": 4819
},
{
"epoch": 2.382647385984427,
"grad_norm": 0.114367657354327,
"learning_rate": 7.061308802353683e-06,
"loss": 0.2785,
"step": 4820
},
{
"epoch": 2.3831417624521074,
"grad_norm": 0.12387810383256194,
"learning_rate": 7.057587064740521e-06,
"loss": 0.3166,
"step": 4821
},
{
"epoch": 2.3836361389197873,
"grad_norm": 0.12253321856864666,
"learning_rate": 7.0538657732677875e-06,
"loss": 0.2909,
"step": 4822
},
{
"epoch": 2.3841305153874677,
"grad_norm": 0.12817740497148486,
"learning_rate": 7.050144928499727e-06,
"loss": 0.3187,
"step": 4823
},
{
"epoch": 2.3846248918551476,
"grad_norm": 0.12090062676434803,
"learning_rate": 7.0464245310005e-06,
"loss": 0.3,
"step": 4824
},
{
"epoch": 2.385119268322828,
"grad_norm": 0.11708064905630869,
"learning_rate": 7.042704581334212e-06,
"loss": 0.295,
"step": 4825
},
{
"epoch": 2.385613644790508,
"grad_norm": 0.12635057459832236,
"learning_rate": 7.038985080064897e-06,
"loss": 0.3131,
"step": 4826
},
{
"epoch": 2.386108021258188,
"grad_norm": 0.1271486550624924,
"learning_rate": 7.035266027756522e-06,
"loss": 0.3199,
"step": 4827
},
{
"epoch": 2.386602397725868,
"grad_norm": 0.12384862993884917,
"learning_rate": 7.031547424972975e-06,
"loss": 0.299,
"step": 4828
},
{
"epoch": 2.3870967741935485,
"grad_norm": 0.12793579854602294,
"learning_rate": 7.0278292722781e-06,
"loss": 0.3028,
"step": 4829
},
{
"epoch": 2.3875911506612284,
"grad_norm": 0.11960228562526583,
"learning_rate": 7.02411157023565e-06,
"loss": 0.2841,
"step": 4830
},
{
"epoch": 2.3880855271289088,
"grad_norm": 0.12213022644972879,
"learning_rate": 7.0203943194093185e-06,
"loss": 0.2908,
"step": 4831
},
{
"epoch": 2.3885799035965887,
"grad_norm": 0.11812251542656897,
"learning_rate": 7.016677520362729e-06,
"loss": 0.3076,
"step": 4832
},
{
"epoch": 2.389074280064269,
"grad_norm": 0.12197359269182094,
"learning_rate": 7.012961173659437e-06,
"loss": 0.2929,
"step": 4833
},
{
"epoch": 2.389568656531949,
"grad_norm": 0.13514089860773568,
"learning_rate": 7.009245279862934e-06,
"loss": 0.3064,
"step": 4834
},
{
"epoch": 2.3900630329996293,
"grad_norm": 0.12430116403196531,
"learning_rate": 7.0055298395366365e-06,
"loss": 0.3095,
"step": 4835
},
{
"epoch": 2.390557409467309,
"grad_norm": 0.12457090940967472,
"learning_rate": 7.0018148532438955e-06,
"loss": 0.339,
"step": 4836
},
{
"epoch": 2.3910517859349896,
"grad_norm": 0.2861636487223316,
"learning_rate": 6.998100321547991e-06,
"loss": 0.3152,
"step": 4837
},
{
"epoch": 2.3915461624026695,
"grad_norm": 0.1287759778406791,
"learning_rate": 6.994386245012135e-06,
"loss": 0.2867,
"step": 4838
},
{
"epoch": 2.39204053887035,
"grad_norm": 0.19660446925360484,
"learning_rate": 6.990672624199467e-06,
"loss": 0.2935,
"step": 4839
},
{
"epoch": 2.3925349153380298,
"grad_norm": 0.12147296763687923,
"learning_rate": 6.986959459673068e-06,
"loss": 0.2967,
"step": 4840
},
{
"epoch": 2.39302929180571,
"grad_norm": 0.11568609832000866,
"learning_rate": 6.983246751995939e-06,
"loss": 0.2933,
"step": 4841
},
{
"epoch": 2.39352366827339,
"grad_norm": 0.1225149382430467,
"learning_rate": 6.979534501731017e-06,
"loss": 0.2963,
"step": 4842
},
{
"epoch": 2.3940180447410704,
"grad_norm": 0.13056244093071137,
"learning_rate": 6.975822709441166e-06,
"loss": 0.3115,
"step": 4843
},
{
"epoch": 2.3945124212087503,
"grad_norm": 0.11670721989458002,
"learning_rate": 6.972111375689183e-06,
"loss": 0.3019,
"step": 4844
},
{
"epoch": 2.3950067976764307,
"grad_norm": 0.12321466512970129,
"learning_rate": 6.9684005010378e-06,
"loss": 0.2736,
"step": 4845
},
{
"epoch": 2.3955011741441106,
"grad_norm": 0.12006809271866373,
"learning_rate": 6.964690086049673e-06,
"loss": 0.3171,
"step": 4846
},
{
"epoch": 2.395995550611791,
"grad_norm": 0.11593983854291003,
"learning_rate": 6.960980131287385e-06,
"loss": 0.2909,
"step": 4847
},
{
"epoch": 2.396489927079471,
"grad_norm": 0.12046124253437143,
"learning_rate": 6.957270637313458e-06,
"loss": 0.3092,
"step": 4848
},
{
"epoch": 2.396984303547151,
"grad_norm": 0.11588196093734425,
"learning_rate": 6.95356160469034e-06,
"loss": 0.2988,
"step": 4849
},
{
"epoch": 2.397478680014831,
"grad_norm": 0.12031257537918799,
"learning_rate": 6.949853033980407e-06,
"loss": 0.3189,
"step": 4850
},
{
"epoch": 2.3979730564825115,
"grad_norm": 0.12264725880428658,
"learning_rate": 6.946144925745972e-06,
"loss": 0.2749,
"step": 4851
},
{
"epoch": 2.3984674329501914,
"grad_norm": 0.11990996192465653,
"learning_rate": 6.94243728054927e-06,
"loss": 0.2842,
"step": 4852
},
{
"epoch": 2.3989618094178717,
"grad_norm": 0.12272868451628591,
"learning_rate": 6.938730098952473e-06,
"loss": 0.308,
"step": 4853
},
{
"epoch": 2.399456185885552,
"grad_norm": 0.1228597158803589,
"learning_rate": 6.935023381517672e-06,
"loss": 0.3144,
"step": 4854
},
{
"epoch": 2.399950562353232,
"grad_norm": 0.12378200532949744,
"learning_rate": 6.931317128806895e-06,
"loss": 0.2955,
"step": 4855
},
{
"epoch": 2.400444938820912,
"grad_norm": 0.11613543051608641,
"learning_rate": 6.9276113413821075e-06,
"loss": 0.2835,
"step": 4856
},
{
"epoch": 2.4009393152885923,
"grad_norm": 0.12040004582470343,
"learning_rate": 6.923906019805187e-06,
"loss": 0.3366,
"step": 4857
},
{
"epoch": 2.4014336917562726,
"grad_norm": 0.16804104080855498,
"learning_rate": 6.920201164637953e-06,
"loss": 0.3302,
"step": 4858
},
{
"epoch": 2.4019280682239526,
"grad_norm": 0.11551125801829366,
"learning_rate": 6.9164967764421494e-06,
"loss": 0.2885,
"step": 4859
},
{
"epoch": 2.4024224446916325,
"grad_norm": 0.12045871254400005,
"learning_rate": 6.912792855779453e-06,
"loss": 0.2962,
"step": 4860
},
{
"epoch": 2.402916821159313,
"grad_norm": 0.1303876855432015,
"learning_rate": 6.909089403211459e-06,
"loss": 0.3063,
"step": 4861
},
{
"epoch": 2.403411197626993,
"grad_norm": 0.1183107192343879,
"learning_rate": 6.905386419299709e-06,
"loss": 0.3064,
"step": 4862
},
{
"epoch": 2.403905574094673,
"grad_norm": 0.12841016083129414,
"learning_rate": 6.901683904605663e-06,
"loss": 0.3077,
"step": 4863
},
{
"epoch": 2.404399950562353,
"grad_norm": 0.12562817797086734,
"learning_rate": 6.897981859690706e-06,
"loss": 0.3095,
"step": 4864
},
{
"epoch": 2.4048943270300334,
"grad_norm": 0.12410147732018607,
"learning_rate": 6.894280285116159e-06,
"loss": 0.3071,
"step": 4865
},
{
"epoch": 2.4053887034977137,
"grad_norm": 0.1177865627226221,
"learning_rate": 6.89057918144327e-06,
"loss": 0.2732,
"step": 4866
},
{
"epoch": 2.4058830799653936,
"grad_norm": 0.12585572746890594,
"learning_rate": 6.886878549233215e-06,
"loss": 0.2759,
"step": 4867
},
{
"epoch": 2.4063774564330735,
"grad_norm": 0.12207620059918659,
"learning_rate": 6.8831783890471025e-06,
"loss": 0.3057,
"step": 4868
},
{
"epoch": 2.406871832900754,
"grad_norm": 0.22614876596443428,
"learning_rate": 6.879478701445961e-06,
"loss": 0.2961,
"step": 4869
},
{
"epoch": 2.4073662093684343,
"grad_norm": 0.12548386778242315,
"learning_rate": 6.875779486990754e-06,
"loss": 0.3169,
"step": 4870
},
{
"epoch": 2.407860585836114,
"grad_norm": 0.11668917264645362,
"learning_rate": 6.872080746242369e-06,
"loss": 0.3116,
"step": 4871
},
{
"epoch": 2.4083549623037945,
"grad_norm": 0.12769136776530948,
"learning_rate": 6.868382479761621e-06,
"loss": 0.2745,
"step": 4872
},
{
"epoch": 2.4088493387714744,
"grad_norm": 0.1247314792623733,
"learning_rate": 6.864684688109266e-06,
"loss": 0.2908,
"step": 4873
},
{
"epoch": 2.409343715239155,
"grad_norm": 0.13155819403117402,
"learning_rate": 6.86098737184597e-06,
"loss": 0.2861,
"step": 4874
},
{
"epoch": 2.4098380917068347,
"grad_norm": 0.1208253047686487,
"learning_rate": 6.8572905315323365e-06,
"loss": 0.3112,
"step": 4875
},
{
"epoch": 2.410332468174515,
"grad_norm": 0.12493161394774648,
"learning_rate": 6.853594167728896e-06,
"loss": 0.2731,
"step": 4876
},
{
"epoch": 2.410826844642195,
"grad_norm": 0.1143343874283613,
"learning_rate": 6.849898280996106e-06,
"loss": 0.2918,
"step": 4877
},
{
"epoch": 2.4113212211098753,
"grad_norm": 0.12379622785324293,
"learning_rate": 6.8462028718943505e-06,
"loss": 0.3263,
"step": 4878
},
{
"epoch": 2.4118155975775553,
"grad_norm": 0.12832436209870104,
"learning_rate": 6.842507940983947e-06,
"loss": 0.2817,
"step": 4879
},
{
"epoch": 2.4123099740452356,
"grad_norm": 0.11944046372889952,
"learning_rate": 6.838813488825129e-06,
"loss": 0.3048,
"step": 4880
},
{
"epoch": 2.4128043505129155,
"grad_norm": 0.11721085955678225,
"learning_rate": 6.835119515978067e-06,
"loss": 0.2874,
"step": 4881
},
{
"epoch": 2.413298726980596,
"grad_norm": 0.12001899639019652,
"learning_rate": 6.831426023002856e-06,
"loss": 0.2863,
"step": 4882
},
{
"epoch": 2.413793103448276,
"grad_norm": 0.11510799171330129,
"learning_rate": 6.827733010459516e-06,
"loss": 0.2751,
"step": 4883
},
{
"epoch": 2.414287479915956,
"grad_norm": 0.12377064859239799,
"learning_rate": 6.8240404789080006e-06,
"loss": 0.3022,
"step": 4884
},
{
"epoch": 2.414781856383636,
"grad_norm": 0.1200142717261807,
"learning_rate": 6.820348428908183e-06,
"loss": 0.3226,
"step": 4885
},
{
"epoch": 2.4152762328513164,
"grad_norm": 0.12119185619812595,
"learning_rate": 6.816656861019871e-06,
"loss": 0.2987,
"step": 4886
},
{
"epoch": 2.4157706093189963,
"grad_norm": 0.1192309528332258,
"learning_rate": 6.812965775802789e-06,
"loss": 0.2821,
"step": 4887
},
{
"epoch": 2.4162649857866767,
"grad_norm": 0.11756368421944208,
"learning_rate": 6.809275173816594e-06,
"loss": 0.3087,
"step": 4888
},
{
"epoch": 2.4167593622543566,
"grad_norm": 0.12335977112025488,
"learning_rate": 6.805585055620877e-06,
"loss": 0.304,
"step": 4889
},
{
"epoch": 2.417253738722037,
"grad_norm": 0.12648238297353742,
"learning_rate": 6.801895421775142e-06,
"loss": 0.287,
"step": 4890
},
{
"epoch": 2.417748115189717,
"grad_norm": 0.12384598341210323,
"learning_rate": 6.79820627283883e-06,
"loss": 0.3043,
"step": 4891
},
{
"epoch": 2.4182424916573972,
"grad_norm": 0.11901063648188627,
"learning_rate": 6.794517609371301e-06,
"loss": 0.303,
"step": 4892
},
{
"epoch": 2.418736868125077,
"grad_norm": 0.12261051781195863,
"learning_rate": 6.790829431931848e-06,
"loss": 0.2964,
"step": 4893
},
{
"epoch": 2.4192312445927575,
"grad_norm": 0.12107243189635915,
"learning_rate": 6.78714174107968e-06,
"loss": 0.2998,
"step": 4894
},
{
"epoch": 2.4197256210604374,
"grad_norm": 0.11920409598458476,
"learning_rate": 6.78345453737395e-06,
"loss": 0.2922,
"step": 4895
},
{
"epoch": 2.420219997528118,
"grad_norm": 0.12135053655272596,
"learning_rate": 6.7797678213737236e-06,
"loss": 0.309,
"step": 4896
},
{
"epoch": 2.4207143739957977,
"grad_norm": 0.11489137248816088,
"learning_rate": 6.776081593637992e-06,
"loss": 0.2796,
"step": 4897
},
{
"epoch": 2.421208750463478,
"grad_norm": 0.11715056837819601,
"learning_rate": 6.772395854725677e-06,
"loss": 0.2959,
"step": 4898
},
{
"epoch": 2.421703126931158,
"grad_norm": 0.12154402304630378,
"learning_rate": 6.768710605195624e-06,
"loss": 0.2842,
"step": 4899
},
{
"epoch": 2.4221975033988383,
"grad_norm": 0.11064198182890164,
"learning_rate": 6.765025845606609e-06,
"loss": 0.2961,
"step": 4900
},
{
"epoch": 2.4226918798665182,
"grad_norm": 0.12039058211815622,
"learning_rate": 6.761341576517326e-06,
"loss": 0.2734,
"step": 4901
},
{
"epoch": 2.4231862563341986,
"grad_norm": 0.11770380565152874,
"learning_rate": 6.757657798486405e-06,
"loss": 0.3071,
"step": 4902
},
{
"epoch": 2.4236806328018785,
"grad_norm": 0.11940689464679981,
"learning_rate": 6.753974512072387e-06,
"loss": 0.2884,
"step": 4903
},
{
"epoch": 2.424175009269559,
"grad_norm": 0.12224205205157637,
"learning_rate": 6.750291717833748e-06,
"loss": 0.3141,
"step": 4904
},
{
"epoch": 2.424669385737239,
"grad_norm": 0.11676492765508127,
"learning_rate": 6.7466094163288955e-06,
"loss": 0.3154,
"step": 4905
},
{
"epoch": 2.425163762204919,
"grad_norm": 0.11959059025125385,
"learning_rate": 6.7429276081161465e-06,
"loss": 0.3052,
"step": 4906
},
{
"epoch": 2.425658138672599,
"grad_norm": 0.12271759407505436,
"learning_rate": 6.739246293753756e-06,
"loss": 0.3067,
"step": 4907
},
{
"epoch": 2.4261525151402794,
"grad_norm": 0.11616313793631966,
"learning_rate": 6.735565473799896e-06,
"loss": 0.2969,
"step": 4908
},
{
"epoch": 2.4266468916079593,
"grad_norm": 0.11670937753906402,
"learning_rate": 6.731885148812674e-06,
"loss": 0.2825,
"step": 4909
},
{
"epoch": 2.4271412680756397,
"grad_norm": 0.11754100765517801,
"learning_rate": 6.728205319350104e-06,
"loss": 0.2877,
"step": 4910
},
{
"epoch": 2.4276356445433196,
"grad_norm": 0.11928897457957433,
"learning_rate": 6.724525985970147e-06,
"loss": 0.3207,
"step": 4911
},
{
"epoch": 2.428130021011,
"grad_norm": 0.12342584393012208,
"learning_rate": 6.720847149230678e-06,
"loss": 0.2928,
"step": 4912
},
{
"epoch": 2.42862439747868,
"grad_norm": 0.12261052061364083,
"learning_rate": 6.717168809689491e-06,
"loss": 0.3074,
"step": 4913
},
{
"epoch": 2.42911877394636,
"grad_norm": 0.12466303921572605,
"learning_rate": 6.713490967904313e-06,
"loss": 0.2942,
"step": 4914
},
{
"epoch": 2.42961315041404,
"grad_norm": 0.12194860662235876,
"learning_rate": 6.7098136244327915e-06,
"loss": 0.3179,
"step": 4915
},
{
"epoch": 2.4301075268817205,
"grad_norm": 0.1184303296011999,
"learning_rate": 6.7061367798325035e-06,
"loss": 0.3,
"step": 4916
},
{
"epoch": 2.4306019033494004,
"grad_norm": 0.12025715917347683,
"learning_rate": 6.702460434660947e-06,
"loss": 0.3073,
"step": 4917
},
{
"epoch": 2.4310962798170808,
"grad_norm": 0.12719637020382815,
"learning_rate": 6.6987845894755396e-06,
"loss": 0.2924,
"step": 4918
},
{
"epoch": 2.4315906562847607,
"grad_norm": 0.11735387555809391,
"learning_rate": 6.695109244833635e-06,
"loss": 0.2849,
"step": 4919
},
{
"epoch": 2.432085032752441,
"grad_norm": 0.11341067189433342,
"learning_rate": 6.691434401292497e-06,
"loss": 0.3002,
"step": 4920
},
{
"epoch": 2.432579409220121,
"grad_norm": 0.11716784090910928,
"learning_rate": 6.687760059409319e-06,
"loss": 0.3018,
"step": 4921
},
{
"epoch": 2.4330737856878013,
"grad_norm": 0.11849376527004664,
"learning_rate": 6.684086219741226e-06,
"loss": 0.2779,
"step": 4922
},
{
"epoch": 2.433568162155481,
"grad_norm": 0.11439003944921526,
"learning_rate": 6.680412882845256e-06,
"loss": 0.2796,
"step": 4923
},
{
"epoch": 2.4340625386231616,
"grad_norm": 0.12238715920249084,
"learning_rate": 6.676740049278376e-06,
"loss": 0.2906,
"step": 4924
},
{
"epoch": 2.4345569150908415,
"grad_norm": 0.13078716097317344,
"learning_rate": 6.673067719597477e-06,
"loss": 0.3189,
"step": 4925
},
{
"epoch": 2.435051291558522,
"grad_norm": 0.12312790471972514,
"learning_rate": 6.669395894359369e-06,
"loss": 0.2971,
"step": 4926
},
{
"epoch": 2.4355456680262018,
"grad_norm": 0.11808683909751888,
"learning_rate": 6.665724574120791e-06,
"loss": 0.3206,
"step": 4927
},
{
"epoch": 2.436040044493882,
"grad_norm": 0.12797739188726853,
"learning_rate": 6.662053759438407e-06,
"loss": 0.3108,
"step": 4928
},
{
"epoch": 2.4365344209615625,
"grad_norm": 0.12305182869629429,
"learning_rate": 6.658383450868795e-06,
"loss": 0.2862,
"step": 4929
},
{
"epoch": 2.4370287974292424,
"grad_norm": 0.11985729900503404,
"learning_rate": 6.654713648968463e-06,
"loss": 0.3072,
"step": 4930
},
{
"epoch": 2.4375231738969223,
"grad_norm": 0.12057115971475259,
"learning_rate": 6.651044354293842e-06,
"loss": 0.2983,
"step": 4931
},
{
"epoch": 2.4380175503646027,
"grad_norm": 0.11831817532176242,
"learning_rate": 6.647375567401283e-06,
"loss": 0.2913,
"step": 4932
},
{
"epoch": 2.438511926832283,
"grad_norm": 0.12314601854153943,
"learning_rate": 6.643707288847066e-06,
"loss": 0.3096,
"step": 4933
},
{
"epoch": 2.439006303299963,
"grad_norm": 0.1146536662198182,
"learning_rate": 6.640039519187388e-06,
"loss": 0.2808,
"step": 4934
},
{
"epoch": 2.439500679767643,
"grad_norm": 0.1210090939785474,
"learning_rate": 6.636372258978374e-06,
"loss": 0.2943,
"step": 4935
},
{
"epoch": 2.439995056235323,
"grad_norm": 0.11718468401187507,
"learning_rate": 6.632705508776063e-06,
"loss": 0.2796,
"step": 4936
},
{
"epoch": 2.4404894327030036,
"grad_norm": 0.13322743399297035,
"learning_rate": 6.62903926913642e-06,
"loss": 0.3044,
"step": 4937
},
{
"epoch": 2.4409838091706835,
"grad_norm": 0.11746325521337304,
"learning_rate": 6.625373540615348e-06,
"loss": 0.2919,
"step": 4938
},
{
"epoch": 2.4414781856383634,
"grad_norm": 0.11897695934130292,
"learning_rate": 6.621708323768649e-06,
"loss": 0.295,
"step": 4939
},
{
"epoch": 2.4419725621060437,
"grad_norm": 0.12052260293095059,
"learning_rate": 6.618043619152059e-06,
"loss": 0.2791,
"step": 4940
},
{
"epoch": 2.442466938573724,
"grad_norm": 0.11152057155940505,
"learning_rate": 6.614379427321238e-06,
"loss": 0.2992,
"step": 4941
},
{
"epoch": 2.442961315041404,
"grad_norm": 0.11455448001764681,
"learning_rate": 6.610715748831766e-06,
"loss": 0.2839,
"step": 4942
},
{
"epoch": 2.443455691509084,
"grad_norm": 0.11717793123638383,
"learning_rate": 6.607052584239137e-06,
"loss": 0.3009,
"step": 4943
},
{
"epoch": 2.4439500679767643,
"grad_norm": 0.11749957509417105,
"learning_rate": 6.603389934098783e-06,
"loss": 0.2876,
"step": 4944
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.1203050384015283,
"learning_rate": 6.5997277989660495e-06,
"loss": 0.2992,
"step": 4945
},
{
"epoch": 2.4449388209121246,
"grad_norm": 0.11891751401411535,
"learning_rate": 6.5960661793961985e-06,
"loss": 0.2906,
"step": 4946
},
{
"epoch": 2.445433197379805,
"grad_norm": 0.11546695922953697,
"learning_rate": 6.592405075944424e-06,
"loss": 0.3072,
"step": 4947
},
{
"epoch": 2.445927573847485,
"grad_norm": 0.1162746307026761,
"learning_rate": 6.588744489165832e-06,
"loss": 0.2959,
"step": 4948
},
{
"epoch": 2.446421950315165,
"grad_norm": 0.1188691668073944,
"learning_rate": 6.585084419615463e-06,
"loss": 0.3023,
"step": 4949
},
{
"epoch": 2.446916326782845,
"grad_norm": 0.11742397049372304,
"learning_rate": 6.581424867848266e-06,
"loss": 0.3041,
"step": 4950
},
{
"epoch": 2.4474107032505255,
"grad_norm": 0.1185891553483392,
"learning_rate": 6.577765834419119e-06,
"loss": 0.2894,
"step": 4951
},
{
"epoch": 2.4479050797182054,
"grad_norm": 0.12833663000345458,
"learning_rate": 6.5741073198828195e-06,
"loss": 0.2919,
"step": 4952
},
{
"epoch": 2.4483994561858857,
"grad_norm": 0.12545365427702496,
"learning_rate": 6.570449324794084e-06,
"loss": 0.2974,
"step": 4953
},
{
"epoch": 2.4488938326535656,
"grad_norm": 0.119114238430532,
"learning_rate": 6.566791849707551e-06,
"loss": 0.2966,
"step": 4954
},
{
"epoch": 2.449388209121246,
"grad_norm": 0.1214856167239477,
"learning_rate": 6.563134895177786e-06,
"loss": 0.2834,
"step": 4955
},
{
"epoch": 2.449882585588926,
"grad_norm": 0.11994309336600581,
"learning_rate": 6.55947846175927e-06,
"loss": 0.2841,
"step": 4956
},
{
"epoch": 2.4503769620566063,
"grad_norm": 0.12177580276651141,
"learning_rate": 6.555822550006404e-06,
"loss": 0.2901,
"step": 4957
},
{
"epoch": 2.450871338524286,
"grad_norm": 0.12072581045120723,
"learning_rate": 6.552167160473515e-06,
"loss": 0.2959,
"step": 4958
},
{
"epoch": 2.4513657149919665,
"grad_norm": 0.11779904947290257,
"learning_rate": 6.548512293714841e-06,
"loss": 0.3145,
"step": 4959
},
{
"epoch": 2.4518600914596465,
"grad_norm": 0.12606841029581514,
"learning_rate": 6.544857950284558e-06,
"loss": 0.299,
"step": 4960
},
{
"epoch": 2.452354467927327,
"grad_norm": 0.12096052149726787,
"learning_rate": 6.5412041307367455e-06,
"loss": 0.2871,
"step": 4961
},
{
"epoch": 2.4528488443950067,
"grad_norm": 0.12954023221232328,
"learning_rate": 6.537550835625411e-06,
"loss": 0.3219,
"step": 4962
},
{
"epoch": 2.453343220862687,
"grad_norm": 0.11904215691699109,
"learning_rate": 6.533898065504483e-06,
"loss": 0.3103,
"step": 4963
},
{
"epoch": 2.453837597330367,
"grad_norm": 0.1196913357176147,
"learning_rate": 6.530245820927806e-06,
"loss": 0.2849,
"step": 4964
},
{
"epoch": 2.4543319737980474,
"grad_norm": 0.12217255696934204,
"learning_rate": 6.52659410244915e-06,
"loss": 0.2942,
"step": 4965
},
{
"epoch": 2.4548263502657273,
"grad_norm": 0.11855918512901296,
"learning_rate": 6.522942910622206e-06,
"loss": 0.2953,
"step": 4966
},
{
"epoch": 2.4553207267334076,
"grad_norm": 0.11693402893991094,
"learning_rate": 6.519292246000577e-06,
"loss": 0.2983,
"step": 4967
},
{
"epoch": 2.4558151032010875,
"grad_norm": 0.11889768267987808,
"learning_rate": 6.515642109137799e-06,
"loss": 0.3059,
"step": 4968
},
{
"epoch": 2.456309479668768,
"grad_norm": 0.12150199017911606,
"learning_rate": 6.511992500587312e-06,
"loss": 0.2988,
"step": 4969
},
{
"epoch": 2.456803856136448,
"grad_norm": 0.12318211032176812,
"learning_rate": 6.5083434209024835e-06,
"loss": 0.3062,
"step": 4970
},
{
"epoch": 2.457298232604128,
"grad_norm": 0.11958762642073933,
"learning_rate": 6.504694870636612e-06,
"loss": 0.2958,
"step": 4971
},
{
"epoch": 2.457792609071808,
"grad_norm": 0.12116545674800416,
"learning_rate": 6.501046850342898e-06,
"loss": 0.2882,
"step": 4972
},
{
"epoch": 2.4582869855394884,
"grad_norm": 0.11882693414560434,
"learning_rate": 6.49739936057447e-06,
"loss": 0.2813,
"step": 4973
},
{
"epoch": 2.4587813620071683,
"grad_norm": 0.12085899157860439,
"learning_rate": 6.493752401884374e-06,
"loss": 0.3079,
"step": 4974
},
{
"epoch": 2.4592757384748487,
"grad_norm": 0.12151256864508131,
"learning_rate": 6.49010597482558e-06,
"loss": 0.2997,
"step": 4975
},
{
"epoch": 2.4597701149425286,
"grad_norm": 0.1252728552333165,
"learning_rate": 6.486460079950966e-06,
"loss": 0.2804,
"step": 4976
},
{
"epoch": 2.460264491410209,
"grad_norm": 0.11801344762393458,
"learning_rate": 6.482814717813346e-06,
"loss": 0.2734,
"step": 4977
},
{
"epoch": 2.460758867877889,
"grad_norm": 0.12123385971761741,
"learning_rate": 6.4791698889654445e-06,
"loss": 0.2939,
"step": 4978
},
{
"epoch": 2.4612532443455692,
"grad_norm": 0.12142623862821039,
"learning_rate": 6.475525593959897e-06,
"loss": 0.2788,
"step": 4979
},
{
"epoch": 2.461747620813249,
"grad_norm": 0.12008187169841265,
"learning_rate": 6.471881833349272e-06,
"loss": 0.2997,
"step": 4980
},
{
"epoch": 2.4622419972809295,
"grad_norm": 0.12082241699976722,
"learning_rate": 6.4682386076860486e-06,
"loss": 0.2878,
"step": 4981
},
{
"epoch": 2.4627363737486094,
"grad_norm": 0.1153704365783294,
"learning_rate": 6.464595917522629e-06,
"loss": 0.3044,
"step": 4982
},
{
"epoch": 2.46323075021629,
"grad_norm": 0.11970065983683181,
"learning_rate": 6.460953763411332e-06,
"loss": 0.3116,
"step": 4983
},
{
"epoch": 2.4637251266839697,
"grad_norm": 0.12017679053675141,
"learning_rate": 6.457312145904398e-06,
"loss": 0.2754,
"step": 4984
},
{
"epoch": 2.46421950315165,
"grad_norm": 0.11651099533874319,
"learning_rate": 6.453671065553979e-06,
"loss": 0.3017,
"step": 4985
},
{
"epoch": 2.46471387961933,
"grad_norm": 0.11844307582414655,
"learning_rate": 6.450030522912147e-06,
"loss": 0.2958,
"step": 4986
},
{
"epoch": 2.4652082560870103,
"grad_norm": 0.11784357488362747,
"learning_rate": 6.446390518530909e-06,
"loss": 0.3001,
"step": 4987
},
{
"epoch": 2.4657026325546902,
"grad_norm": 0.12337515513019252,
"learning_rate": 6.442751052962165e-06,
"loss": 0.297,
"step": 4988
},
{
"epoch": 2.4661970090223706,
"grad_norm": 0.11393145683627448,
"learning_rate": 6.439112126757751e-06,
"loss": 0.3165,
"step": 4989
},
{
"epoch": 2.4666913854900505,
"grad_norm": 0.19587697401577323,
"learning_rate": 6.435473740469413e-06,
"loss": 0.285,
"step": 4990
},
{
"epoch": 2.467185761957731,
"grad_norm": 0.12076785736736088,
"learning_rate": 6.43183589464882e-06,
"loss": 0.2809,
"step": 4991
},
{
"epoch": 2.467680138425411,
"grad_norm": 0.1142604754806383,
"learning_rate": 6.42819858984755e-06,
"loss": 0.273,
"step": 4992
},
{
"epoch": 2.468174514893091,
"grad_norm": 0.11845875366322405,
"learning_rate": 6.424561826617115e-06,
"loss": 0.2755,
"step": 4993
},
{
"epoch": 2.468668891360771,
"grad_norm": 0.12469027345210604,
"learning_rate": 6.420925605508933e-06,
"loss": 0.3114,
"step": 4994
},
{
"epoch": 2.4691632678284514,
"grad_norm": 0.11403099510171075,
"learning_rate": 6.41728992707434e-06,
"loss": 0.2798,
"step": 4995
},
{
"epoch": 2.4696576442961313,
"grad_norm": 0.11678500958663608,
"learning_rate": 6.413654791864592e-06,
"loss": 0.3101,
"step": 4996
},
{
"epoch": 2.4701520207638117,
"grad_norm": 0.2184457066673345,
"learning_rate": 6.410020200430862e-06,
"loss": 0.3383,
"step": 4997
},
{
"epoch": 2.4706463972314916,
"grad_norm": 0.12096892115860496,
"learning_rate": 6.406386153324247e-06,
"loss": 0.3013,
"step": 4998
},
{
"epoch": 2.471140773699172,
"grad_norm": 0.11543920016583353,
"learning_rate": 6.40275265109575e-06,
"loss": 0.2892,
"step": 4999
},
{
"epoch": 2.471635150166852,
"grad_norm": 0.1359466330717708,
"learning_rate": 6.3991196942963e-06,
"loss": 0.31,
"step": 5000
},
{
"epoch": 2.4721295266345322,
"grad_norm": 0.11480831072079349,
"learning_rate": 6.395487283476741e-06,
"loss": 0.269,
"step": 5001
},
{
"epoch": 2.472623903102212,
"grad_norm": 0.12228856587957779,
"learning_rate": 6.391855419187831e-06,
"loss": 0.3053,
"step": 5002
},
{
"epoch": 2.4731182795698925,
"grad_norm": 0.11816197160330183,
"learning_rate": 6.388224101980247e-06,
"loss": 0.2847,
"step": 5003
},
{
"epoch": 2.473612656037573,
"grad_norm": 0.12608604820497743,
"learning_rate": 6.384593332404588e-06,
"loss": 0.3139,
"step": 5004
},
{
"epoch": 2.4741070325052528,
"grad_norm": 0.12216504546411183,
"learning_rate": 6.380963111011362e-06,
"loss": 0.3116,
"step": 5005
},
{
"epoch": 2.4746014089729327,
"grad_norm": 0.12300723028301616,
"learning_rate": 6.3773334383510014e-06,
"loss": 0.2939,
"step": 5006
},
{
"epoch": 2.475095785440613,
"grad_norm": 0.12430022443444685,
"learning_rate": 6.373704314973849e-06,
"loss": 0.286,
"step": 5007
},
{
"epoch": 2.4755901619082934,
"grad_norm": 0.12287796817731605,
"learning_rate": 6.370075741430166e-06,
"loss": 0.2988,
"step": 5008
},
{
"epoch": 2.4760845383759733,
"grad_norm": 0.12151860714771338,
"learning_rate": 6.366447718270134e-06,
"loss": 0.2845,
"step": 5009
},
{
"epoch": 2.4765789148436532,
"grad_norm": 0.1194889842210467,
"learning_rate": 6.36282024604385e-06,
"loss": 0.2895,
"step": 5010
},
{
"epoch": 2.4770732913113336,
"grad_norm": 0.12662767141540954,
"learning_rate": 6.359193325301323e-06,
"loss": 0.2837,
"step": 5011
},
{
"epoch": 2.477567667779014,
"grad_norm": 0.11790478836794617,
"learning_rate": 6.355566956592478e-06,
"loss": 0.28,
"step": 5012
},
{
"epoch": 2.478062044246694,
"grad_norm": 0.12463822256418833,
"learning_rate": 6.351941140467166e-06,
"loss": 0.2916,
"step": 5013
},
{
"epoch": 2.4785564207143738,
"grad_norm": 0.12651601303162366,
"learning_rate": 6.348315877475142e-06,
"loss": 0.2989,
"step": 5014
},
{
"epoch": 2.479050797182054,
"grad_norm": 0.12522792711886638,
"learning_rate": 6.344691168166087e-06,
"loss": 0.3126,
"step": 5015
},
{
"epoch": 2.4795451736497345,
"grad_norm": 0.11611167912476102,
"learning_rate": 6.341067013089594e-06,
"loss": 0.2894,
"step": 5016
},
{
"epoch": 2.4800395501174144,
"grad_norm": 0.11933269508835097,
"learning_rate": 6.337443412795171e-06,
"loss": 0.2878,
"step": 5017
},
{
"epoch": 2.4805339265850943,
"grad_norm": 0.12709630637311242,
"learning_rate": 6.333820367832242e-06,
"loss": 0.3055,
"step": 5018
},
{
"epoch": 2.4810283030527747,
"grad_norm": 0.12661799833339601,
"learning_rate": 6.330197878750142e-06,
"loss": 0.2892,
"step": 5019
},
{
"epoch": 2.481522679520455,
"grad_norm": 0.13577577263677357,
"learning_rate": 6.326575946098141e-06,
"loss": 0.2965,
"step": 5020
},
{
"epoch": 2.482017055988135,
"grad_norm": 0.12048568401781923,
"learning_rate": 6.322954570425399e-06,
"loss": 0.3007,
"step": 5021
},
{
"epoch": 2.4825114324558153,
"grad_norm": 0.12507269337217808,
"learning_rate": 6.319333752281009e-06,
"loss": 0.3126,
"step": 5022
},
{
"epoch": 2.483005808923495,
"grad_norm": 0.12652846527926376,
"learning_rate": 6.315713492213973e-06,
"loss": 0.2902,
"step": 5023
},
{
"epoch": 2.4835001853911756,
"grad_norm": 0.11577488967766987,
"learning_rate": 6.3120937907732104e-06,
"loss": 0.2872,
"step": 5024
},
{
"epoch": 2.4839945618588555,
"grad_norm": 0.11926069688374868,
"learning_rate": 6.308474648507547e-06,
"loss": 0.286,
"step": 5025
},
{
"epoch": 2.484488938326536,
"grad_norm": 0.123771978231262,
"learning_rate": 6.3048560659657435e-06,
"loss": 0.3171,
"step": 5026
},
{
"epoch": 2.4849833147942157,
"grad_norm": 0.128310050187285,
"learning_rate": 6.301238043696458e-06,
"loss": 0.288,
"step": 5027
},
{
"epoch": 2.485477691261896,
"grad_norm": 0.11545374374747729,
"learning_rate": 6.2976205822482696e-06,
"loss": 0.3084,
"step": 5028
},
{
"epoch": 2.485972067729576,
"grad_norm": 0.1202844153376381,
"learning_rate": 6.2940036821696715e-06,
"loss": 0.2867,
"step": 5029
},
{
"epoch": 2.4864664441972564,
"grad_norm": 0.12025481825969407,
"learning_rate": 6.290387344009072e-06,
"loss": 0.2932,
"step": 5030
},
{
"epoch": 2.4869608206649363,
"grad_norm": 0.11842997878844087,
"learning_rate": 6.286771568314798e-06,
"loss": 0.2913,
"step": 5031
},
{
"epoch": 2.4874551971326166,
"grad_norm": 0.12569622705010602,
"learning_rate": 6.283156355635087e-06,
"loss": 0.2876,
"step": 5032
},
{
"epoch": 2.4879495736002966,
"grad_norm": 0.12344515666087573,
"learning_rate": 6.279541706518091e-06,
"loss": 0.289,
"step": 5033
},
{
"epoch": 2.488443950067977,
"grad_norm": 0.12223842307844186,
"learning_rate": 6.27592762151188e-06,
"loss": 0.303,
"step": 5034
},
{
"epoch": 2.488938326535657,
"grad_norm": 0.1210734859821162,
"learning_rate": 6.272314101164431e-06,
"loss": 0.3001,
"step": 5035
},
{
"epoch": 2.489432703003337,
"grad_norm": 0.12559131549116728,
"learning_rate": 6.268701146023644e-06,
"loss": 0.3001,
"step": 5036
},
{
"epoch": 2.489927079471017,
"grad_norm": 0.12063850760168077,
"learning_rate": 6.265088756637328e-06,
"loss": 0.2956,
"step": 5037
},
{
"epoch": 2.4904214559386975,
"grad_norm": 0.11767696106476883,
"learning_rate": 6.26147693355321e-06,
"loss": 0.2859,
"step": 5038
},
{
"epoch": 2.4909158324063774,
"grad_norm": 0.12145225733360544,
"learning_rate": 6.257865677318928e-06,
"loss": 0.298,
"step": 5039
},
{
"epoch": 2.4914102088740577,
"grad_norm": 0.12857274304300528,
"learning_rate": 6.254254988482036e-06,
"loss": 0.284,
"step": 5040
},
{
"epoch": 2.4919045853417376,
"grad_norm": 0.1314549767068553,
"learning_rate": 6.250644867589997e-06,
"loss": 0.3014,
"step": 5041
},
{
"epoch": 2.492398961809418,
"grad_norm": 0.1283747130770994,
"learning_rate": 6.247035315190198e-06,
"loss": 0.299,
"step": 5042
},
{
"epoch": 2.492893338277098,
"grad_norm": 0.13097308239009925,
"learning_rate": 6.243426331829934e-06,
"loss": 0.2891,
"step": 5043
},
{
"epoch": 2.4933877147447783,
"grad_norm": 0.12460177021466437,
"learning_rate": 6.239817918056406e-06,
"loss": 0.2731,
"step": 5044
},
{
"epoch": 2.493882091212458,
"grad_norm": 0.12045789936131783,
"learning_rate": 6.236210074416743e-06,
"loss": 0.2837,
"step": 5045
},
{
"epoch": 2.4943764676801385,
"grad_norm": 0.11741050385780026,
"learning_rate": 6.232602801457979e-06,
"loss": 0.2779,
"step": 5046
},
{
"epoch": 2.4948708441478185,
"grad_norm": 0.12119902722347563,
"learning_rate": 6.228996099727058e-06,
"loss": 0.2862,
"step": 5047
},
{
"epoch": 2.495365220615499,
"grad_norm": 0.11636471269923207,
"learning_rate": 6.225389969770851e-06,
"loss": 0.3001,
"step": 5048
},
{
"epoch": 2.4958595970831787,
"grad_norm": 0.11821022940033163,
"learning_rate": 6.221784412136129e-06,
"loss": 0.3171,
"step": 5049
},
{
"epoch": 2.496353973550859,
"grad_norm": 0.12788710923113808,
"learning_rate": 6.218179427369585e-06,
"loss": 0.3105,
"step": 5050
},
{
"epoch": 2.496848350018539,
"grad_norm": 0.11384678437297904,
"learning_rate": 6.214575016017816e-06,
"loss": 0.284,
"step": 5051
},
{
"epoch": 2.4973427264862194,
"grad_norm": 0.120953451663072,
"learning_rate": 6.210971178627335e-06,
"loss": 0.298,
"step": 5052
},
{
"epoch": 2.4978371029538993,
"grad_norm": 0.11801039424106063,
"learning_rate": 6.2073679157445824e-06,
"loss": 0.2988,
"step": 5053
},
{
"epoch": 2.4983314794215796,
"grad_norm": 0.12469784112544127,
"learning_rate": 6.203765227915887e-06,
"loss": 0.3154,
"step": 5054
},
{
"epoch": 2.4988258558892595,
"grad_norm": 0.11696747130240231,
"learning_rate": 6.20016311568751e-06,
"loss": 0.3123,
"step": 5055
},
{
"epoch": 2.49932023235694,
"grad_norm": 0.11952495141615742,
"learning_rate": 6.196561579605616e-06,
"loss": 0.285,
"step": 5056
},
{
"epoch": 2.49981460882462,
"grad_norm": 0.1206429870325997,
"learning_rate": 6.192960620216284e-06,
"loss": 0.2733,
"step": 5057
},
{
"epoch": 2.5003089852923,
"grad_norm": 0.12886430471678598,
"learning_rate": 6.189360238065501e-06,
"loss": 0.3026,
"step": 5058
},
{
"epoch": 2.50080336175998,
"grad_norm": 0.7541453648640809,
"learning_rate": 6.185760433699179e-06,
"loss": 0.2854,
"step": 5059
},
{
"epoch": 2.5012977382276604,
"grad_norm": 0.1231039818973836,
"learning_rate": 6.182161207663136e-06,
"loss": 0.2926,
"step": 5060
},
{
"epoch": 2.5012977382276604,
"eval_loss": 0.5084992051124573,
"eval_runtime": 100.9255,
"eval_samples_per_second": 300.757,
"eval_steps_per_second": 37.602,
"step": 5060
},
{
"epoch": 2.5017921146953404,
"grad_norm": 0.11994762459740865,
"learning_rate": 6.1785625605030945e-06,
"loss": 0.2883,
"step": 5061
},
{
"epoch": 2.5022864911630207,
"grad_norm": 0.11788246546320102,
"learning_rate": 6.174964492764697e-06,
"loss": 0.3033,
"step": 5062
},
{
"epoch": 2.5027808676307006,
"grad_norm": 0.12258881553058934,
"learning_rate": 6.171367004993497e-06,
"loss": 0.3034,
"step": 5063
},
{
"epoch": 2.503275244098381,
"grad_norm": 0.12453767667158337,
"learning_rate": 6.167770097734963e-06,
"loss": 0.2944,
"step": 5064
},
{
"epoch": 2.503769620566061,
"grad_norm": 0.12075815239058302,
"learning_rate": 6.164173771534471e-06,
"loss": 0.3027,
"step": 5065
},
{
"epoch": 2.5042639970337413,
"grad_norm": 0.12002672340212932,
"learning_rate": 6.160578026937312e-06,
"loss": 0.3248,
"step": 5066
},
{
"epoch": 2.5047583735014216,
"grad_norm": 0.13057390514304704,
"learning_rate": 6.156982864488683e-06,
"loss": 0.2869,
"step": 5067
},
{
"epoch": 2.5052527499691015,
"grad_norm": 0.11908795201071183,
"learning_rate": 6.153388284733695e-06,
"loss": 0.3025,
"step": 5068
},
{
"epoch": 2.5057471264367814,
"grad_norm": 0.12954573474292447,
"learning_rate": 6.149794288217383e-06,
"loss": 0.3147,
"step": 5069
},
{
"epoch": 2.506241502904462,
"grad_norm": 0.13110201540575772,
"learning_rate": 6.146200875484676e-06,
"loss": 0.3211,
"step": 5070
},
{
"epoch": 2.506735879372142,
"grad_norm": 0.12175297236633785,
"learning_rate": 6.142608047080422e-06,
"loss": 0.2885,
"step": 5071
},
{
"epoch": 2.507230255839822,
"grad_norm": 0.115400371051522,
"learning_rate": 6.1390158035493795e-06,
"loss": 0.2945,
"step": 5072
},
{
"epoch": 2.507724632307502,
"grad_norm": 0.11836555398200287,
"learning_rate": 6.135424145436224e-06,
"loss": 0.3209,
"step": 5073
},
{
"epoch": 2.5082190087751823,
"grad_norm": 0.1178607775332171,
"learning_rate": 6.131833073285525e-06,
"loss": 0.2811,
"step": 5074
},
{
"epoch": 2.5087133852428627,
"grad_norm": 0.1186113276094088,
"learning_rate": 6.128242587641788e-06,
"loss": 0.2812,
"step": 5075
},
{
"epoch": 2.5092077617105426,
"grad_norm": 0.12092957340587608,
"learning_rate": 6.124652689049414e-06,
"loss": 0.2946,
"step": 5076
},
{
"epoch": 2.5097021381782225,
"grad_norm": 0.11892617637663942,
"learning_rate": 6.121063378052714e-06,
"loss": 0.3115,
"step": 5077
},
{
"epoch": 2.510196514645903,
"grad_norm": 0.11953724204873596,
"learning_rate": 6.1174746551959165e-06,
"loss": 0.3048,
"step": 5078
},
{
"epoch": 2.5106908911135832,
"grad_norm": 0.11896227370890444,
"learning_rate": 6.113886521023153e-06,
"loss": 0.3197,
"step": 5079
},
{
"epoch": 2.511185267581263,
"grad_norm": 0.12238170864528551,
"learning_rate": 6.110298976078478e-06,
"loss": 0.3081,
"step": 5080
},
{
"epoch": 2.511679644048943,
"grad_norm": 0.13054792187234818,
"learning_rate": 6.106712020905846e-06,
"loss": 0.305,
"step": 5081
},
{
"epoch": 2.5121740205166234,
"grad_norm": 0.12773656398986624,
"learning_rate": 6.103125656049127e-06,
"loss": 0.2994,
"step": 5082
},
{
"epoch": 2.5126683969843038,
"grad_norm": 0.1221448690469628,
"learning_rate": 6.099539882052099e-06,
"loss": 0.2966,
"step": 5083
},
{
"epoch": 2.5131627734519837,
"grad_norm": 0.11987407982367014,
"learning_rate": 6.0959546994584505e-06,
"loss": 0.3055,
"step": 5084
},
{
"epoch": 2.5136571499196636,
"grad_norm": 0.11478480326400838,
"learning_rate": 6.092370108811779e-06,
"loss": 0.2933,
"step": 5085
},
{
"epoch": 2.514151526387344,
"grad_norm": 0.11854318805048887,
"learning_rate": 6.0887861106556e-06,
"loss": 0.3293,
"step": 5086
},
{
"epoch": 2.5146459028550243,
"grad_norm": 0.11529168091196133,
"learning_rate": 6.085202705533331e-06,
"loss": 0.2967,
"step": 5087
},
{
"epoch": 2.5151402793227042,
"grad_norm": 0.12332100601282475,
"learning_rate": 6.081619893988302e-06,
"loss": 0.29,
"step": 5088
},
{
"epoch": 2.515634655790384,
"grad_norm": 0.12067364423663779,
"learning_rate": 6.078037676563755e-06,
"loss": 0.3048,
"step": 5089
},
{
"epoch": 2.5161290322580645,
"grad_norm": 0.11442181964966427,
"learning_rate": 6.074456053802835e-06,
"loss": 0.3046,
"step": 5090
},
{
"epoch": 2.516623408725745,
"grad_norm": 0.1331357107564662,
"learning_rate": 6.070875026248608e-06,
"loss": 0.293,
"step": 5091
},
{
"epoch": 2.5171177851934248,
"grad_norm": 0.12264438269619836,
"learning_rate": 6.067294594444044e-06,
"loss": 0.2813,
"step": 5092
},
{
"epoch": 2.5176121616611047,
"grad_norm": 0.11696140432755885,
"learning_rate": 6.063714758932018e-06,
"loss": 0.3036,
"step": 5093
},
{
"epoch": 2.518106538128785,
"grad_norm": 0.11655062739520572,
"learning_rate": 6.060135520255319e-06,
"loss": 0.2854,
"step": 5094
},
{
"epoch": 2.5186009145964654,
"grad_norm": 0.11788715188022571,
"learning_rate": 6.0565568789566486e-06,
"loss": 0.2957,
"step": 5095
},
{
"epoch": 2.5190952910641453,
"grad_norm": 0.11916467174013184,
"learning_rate": 6.0529788355786115e-06,
"loss": 0.2873,
"step": 5096
},
{
"epoch": 2.5195896675318252,
"grad_norm": 0.12642915838236377,
"learning_rate": 6.049401390663729e-06,
"loss": 0.3061,
"step": 5097
},
{
"epoch": 2.5200840439995056,
"grad_norm": 0.11756735987154335,
"learning_rate": 6.045824544754425e-06,
"loss": 0.2819,
"step": 5098
},
{
"epoch": 2.520578420467186,
"grad_norm": 0.11712017703422688,
"learning_rate": 6.0422482983930385e-06,
"loss": 0.3167,
"step": 5099
},
{
"epoch": 2.521072796934866,
"grad_norm": 0.12067234212783066,
"learning_rate": 6.038672652121809e-06,
"loss": 0.2842,
"step": 5100
},
{
"epoch": 2.5215671734025458,
"grad_norm": 0.12200816007043368,
"learning_rate": 6.035097606482889e-06,
"loss": 0.3015,
"step": 5101
},
{
"epoch": 2.522061549870226,
"grad_norm": 0.11907264687226463,
"learning_rate": 6.0315231620183515e-06,
"loss": 0.3022,
"step": 5102
},
{
"epoch": 2.5225559263379065,
"grad_norm": 0.21134591171387723,
"learning_rate": 6.027949319270159e-06,
"loss": 0.296,
"step": 5103
},
{
"epoch": 2.5230503028055864,
"grad_norm": 0.1169488290895196,
"learning_rate": 6.0243760787801945e-06,
"loss": 0.2924,
"step": 5104
},
{
"epoch": 2.5235446792732668,
"grad_norm": 0.12790570831230402,
"learning_rate": 6.020803441090246e-06,
"loss": 0.2788,
"step": 5105
},
{
"epoch": 2.5240390557409467,
"grad_norm": 0.11953112365865677,
"learning_rate": 6.017231406742015e-06,
"loss": 0.2923,
"step": 5106
},
{
"epoch": 2.524533432208627,
"grad_norm": 0.12449807333806825,
"learning_rate": 6.013659976277099e-06,
"loss": 0.3246,
"step": 5107
},
{
"epoch": 2.525027808676307,
"grad_norm": 0.11915643182981245,
"learning_rate": 6.010089150237022e-06,
"loss": 0.3089,
"step": 5108
},
{
"epoch": 2.5255221851439873,
"grad_norm": 0.12311833427009011,
"learning_rate": 6.006518929163205e-06,
"loss": 0.3119,
"step": 5109
},
{
"epoch": 2.526016561611667,
"grad_norm": 0.128963015595166,
"learning_rate": 6.002949313596977e-06,
"loss": 0.3031,
"step": 5110
},
{
"epoch": 2.5265109380793476,
"grad_norm": 0.12026989866784721,
"learning_rate": 5.999380304079577e-06,
"loss": 0.2896,
"step": 5111
},
{
"epoch": 2.5270053145470275,
"grad_norm": 0.12155983216383587,
"learning_rate": 5.995811901152151e-06,
"loss": 0.2972,
"step": 5112
},
{
"epoch": 2.527499691014708,
"grad_norm": 0.1218372999378999,
"learning_rate": 5.99224410535576e-06,
"loss": 0.2815,
"step": 5113
},
{
"epoch": 2.5279940674823878,
"grad_norm": 0.11608665318198083,
"learning_rate": 5.9886769172313645e-06,
"loss": 0.2956,
"step": 5114
},
{
"epoch": 2.528488443950068,
"grad_norm": 0.12326203512681404,
"learning_rate": 5.985110337319835e-06,
"loss": 0.2936,
"step": 5115
},
{
"epoch": 2.528982820417748,
"grad_norm": 0.11746850757594382,
"learning_rate": 5.981544366161953e-06,
"loss": 0.2989,
"step": 5116
},
{
"epoch": 2.5294771968854284,
"grad_norm": 0.12620792746366943,
"learning_rate": 5.977979004298403e-06,
"loss": 0.301,
"step": 5117
},
{
"epoch": 2.5299715733531083,
"grad_norm": 0.1148782450216972,
"learning_rate": 5.974414252269778e-06,
"loss": 0.2836,
"step": 5118
},
{
"epoch": 2.5304659498207887,
"grad_norm": 0.12080524469461429,
"learning_rate": 5.970850110616584e-06,
"loss": 0.3023,
"step": 5119
},
{
"epoch": 2.5309603262884686,
"grad_norm": 0.1270767229684479,
"learning_rate": 5.967286579879228e-06,
"loss": 0.3102,
"step": 5120
},
{
"epoch": 2.531454702756149,
"grad_norm": 0.12376091810280489,
"learning_rate": 5.963723660598029e-06,
"loss": 0.2884,
"step": 5121
},
{
"epoch": 2.531949079223829,
"grad_norm": 0.11413059158407866,
"learning_rate": 5.960161353313207e-06,
"loss": 0.3287,
"step": 5122
},
{
"epoch": 2.532443455691509,
"grad_norm": 0.12146260960837965,
"learning_rate": 5.9565996585648965e-06,
"loss": 0.2948,
"step": 5123
},
{
"epoch": 2.532937832159189,
"grad_norm": 0.11977746543337349,
"learning_rate": 5.953038576893135e-06,
"loss": 0.2879,
"step": 5124
},
{
"epoch": 2.5334322086268695,
"grad_norm": 0.12046168501064262,
"learning_rate": 5.949478108837872e-06,
"loss": 0.2817,
"step": 5125
},
{
"epoch": 2.5339265850945494,
"grad_norm": 0.12937267873598304,
"learning_rate": 5.945918254938953e-06,
"loss": 0.3124,
"step": 5126
},
{
"epoch": 2.5344209615622297,
"grad_norm": 0.12117900423932806,
"learning_rate": 5.942359015736141e-06,
"loss": 0.2783,
"step": 5127
},
{
"epoch": 2.5349153380299096,
"grad_norm": 0.11929257937056395,
"learning_rate": 5.938800391769101e-06,
"loss": 0.2926,
"step": 5128
},
{
"epoch": 2.53540971449759,
"grad_norm": 0.11675563687136874,
"learning_rate": 5.935242383577404e-06,
"loss": 0.297,
"step": 5129
},
{
"epoch": 2.53590409096527,
"grad_norm": 0.11970764392850072,
"learning_rate": 5.931684991700535e-06,
"loss": 0.2844,
"step": 5130
},
{
"epoch": 2.5363984674329503,
"grad_norm": 0.11969190188192093,
"learning_rate": 5.928128216677875e-06,
"loss": 0.2992,
"step": 5131
},
{
"epoch": 2.53689284390063,
"grad_norm": 0.11928836007262382,
"learning_rate": 5.924572059048721e-06,
"loss": 0.2976,
"step": 5132
},
{
"epoch": 2.5373872203683105,
"grad_norm": 0.12208388236473318,
"learning_rate": 5.9210165193522675e-06,
"loss": 0.2911,
"step": 5133
},
{
"epoch": 2.5378815968359905,
"grad_norm": 0.11411799022708888,
"learning_rate": 5.917461598127616e-06,
"loss": 0.2794,
"step": 5134
},
{
"epoch": 2.538375973303671,
"grad_norm": 0.11955932164196492,
"learning_rate": 5.913907295913791e-06,
"loss": 0.2891,
"step": 5135
},
{
"epoch": 2.5388703497713507,
"grad_norm": 0.1202750199172695,
"learning_rate": 5.9103536132497e-06,
"loss": 0.2928,
"step": 5136
},
{
"epoch": 2.539364726239031,
"grad_norm": 0.11712114487129646,
"learning_rate": 5.9068005506741675e-06,
"loss": 0.2894,
"step": 5137
},
{
"epoch": 2.539859102706711,
"grad_norm": 0.11602220363599765,
"learning_rate": 5.903248108725925e-06,
"loss": 0.2887,
"step": 5138
},
{
"epoch": 2.5403534791743914,
"grad_norm": 0.11564147553818065,
"learning_rate": 5.8996962879436085e-06,
"loss": 0.2768,
"step": 5139
},
{
"epoch": 2.5408478556420713,
"grad_norm": 0.11665855389624383,
"learning_rate": 5.896145088865753e-06,
"loss": 0.2689,
"step": 5140
},
{
"epoch": 2.5413422321097516,
"grad_norm": 1.267970780672067,
"learning_rate": 5.892594512030814e-06,
"loss": 0.3488,
"step": 5141
},
{
"epoch": 2.541836608577432,
"grad_norm": 0.12023976303779131,
"learning_rate": 5.889044557977144e-06,
"loss": 0.2976,
"step": 5142
},
{
"epoch": 2.542330985045112,
"grad_norm": 0.11691206867174417,
"learning_rate": 5.885495227242995e-06,
"loss": 0.2907,
"step": 5143
},
{
"epoch": 2.542825361512792,
"grad_norm": 0.11876082154665606,
"learning_rate": 5.881946520366534e-06,
"loss": 0.294,
"step": 5144
},
{
"epoch": 2.543319737980472,
"grad_norm": 0.12448365023590495,
"learning_rate": 5.878398437885828e-06,
"loss": 0.2794,
"step": 5145
},
{
"epoch": 2.5438141144481525,
"grad_norm": 0.11964471463684481,
"learning_rate": 5.8748509803388554e-06,
"loss": 0.3139,
"step": 5146
},
{
"epoch": 2.5443084909158324,
"grad_norm": 0.12360305595451576,
"learning_rate": 5.8713041482634936e-06,
"loss": 0.2905,
"step": 5147
},
{
"epoch": 2.5448028673835124,
"grad_norm": 1.519978518052185,
"learning_rate": 5.867757942197531e-06,
"loss": 0.3284,
"step": 5148
},
{
"epoch": 2.5452972438511927,
"grad_norm": 0.16509927540579997,
"learning_rate": 5.864212362678651e-06,
"loss": 0.3306,
"step": 5149
},
{
"epoch": 2.545791620318873,
"grad_norm": 0.13348695697996435,
"learning_rate": 5.860667410244448e-06,
"loss": 0.2795,
"step": 5150
},
{
"epoch": 2.546285996786553,
"grad_norm": 0.13497580554773486,
"learning_rate": 5.857123085432432e-06,
"loss": 0.3085,
"step": 5151
},
{
"epoch": 2.546780373254233,
"grad_norm": 0.1549374794752338,
"learning_rate": 5.85357938878e-06,
"loss": 0.3081,
"step": 5152
},
{
"epoch": 2.5472747497219133,
"grad_norm": 0.15396218039985243,
"learning_rate": 5.850036320824462e-06,
"loss": 0.2982,
"step": 5153
},
{
"epoch": 2.5477691261895936,
"grad_norm": 0.1561990251777467,
"learning_rate": 5.846493882103035e-06,
"loss": 0.2929,
"step": 5154
},
{
"epoch": 2.5482635026572735,
"grad_norm": 0.14884586457222546,
"learning_rate": 5.842952073152837e-06,
"loss": 0.3169,
"step": 5155
},
{
"epoch": 2.5487578791249534,
"grad_norm": 0.14456024698546147,
"learning_rate": 5.839410894510884e-06,
"loss": 0.2884,
"step": 5156
},
{
"epoch": 2.549252255592634,
"grad_norm": 0.13009109612251726,
"learning_rate": 5.835870346714114e-06,
"loss": 0.2883,
"step": 5157
},
{
"epoch": 2.549746632060314,
"grad_norm": 0.12345025359180026,
"learning_rate": 5.832330430299353e-06,
"loss": 0.2783,
"step": 5158
},
{
"epoch": 2.550241008527994,
"grad_norm": 0.12416302009296935,
"learning_rate": 5.828791145803343e-06,
"loss": 0.3036,
"step": 5159
},
{
"epoch": 2.550735384995674,
"grad_norm": 0.12675892531509242,
"learning_rate": 5.8252524937627204e-06,
"loss": 0.3032,
"step": 5160
},
{
"epoch": 2.5512297614633543,
"grad_norm": 0.12095111510967521,
"learning_rate": 5.821714474714022e-06,
"loss": 0.2928,
"step": 5161
},
{
"epoch": 2.5517241379310347,
"grad_norm": 0.11940695483452876,
"learning_rate": 5.818177089193713e-06,
"loss": 0.2954,
"step": 5162
},
{
"epoch": 2.5522185143987146,
"grad_norm": 0.12102477150387479,
"learning_rate": 5.814640337738137e-06,
"loss": 0.2974,
"step": 5163
},
{
"epoch": 2.5527128908663945,
"grad_norm": 0.11810677511818737,
"learning_rate": 5.811104220883547e-06,
"loss": 0.2763,
"step": 5164
},
{
"epoch": 2.553207267334075,
"grad_norm": 0.12680484902386613,
"learning_rate": 5.807568739166109e-06,
"loss": 0.3154,
"step": 5165
},
{
"epoch": 2.5537016438017552,
"grad_norm": 0.12077586020402142,
"learning_rate": 5.8040338931218845e-06,
"loss": 0.2956,
"step": 5166
},
{
"epoch": 2.554196020269435,
"grad_norm": 0.12107459429443021,
"learning_rate": 5.80049968328684e-06,
"loss": 0.2886,
"step": 5167
},
{
"epoch": 2.554690396737115,
"grad_norm": 0.12978725355037635,
"learning_rate": 5.796966110196851e-06,
"loss": 0.3257,
"step": 5168
},
{
"epoch": 2.5551847732047954,
"grad_norm": 0.11959917093462769,
"learning_rate": 5.793433174387686e-06,
"loss": 0.2893,
"step": 5169
},
{
"epoch": 2.555679149672476,
"grad_norm": 0.11703060647689041,
"learning_rate": 5.7899008763950295e-06,
"loss": 0.293,
"step": 5170
},
{
"epoch": 2.5561735261401557,
"grad_norm": 0.12480265514781402,
"learning_rate": 5.7863692167544585e-06,
"loss": 0.2968,
"step": 5171
},
{
"epoch": 2.5566679026078356,
"grad_norm": 0.1196268532045587,
"learning_rate": 5.782838196001454e-06,
"loss": 0.2917,
"step": 5172
},
{
"epoch": 2.557162279075516,
"grad_norm": 0.12338327301085635,
"learning_rate": 5.779307814671408e-06,
"loss": 0.3126,
"step": 5173
},
{
"epoch": 2.5576566555431963,
"grad_norm": 0.126267871861736,
"learning_rate": 5.7757780732996136e-06,
"loss": 0.2863,
"step": 5174
},
{
"epoch": 2.5581510320108762,
"grad_norm": 0.11697096397642023,
"learning_rate": 5.772248972421257e-06,
"loss": 0.2745,
"step": 5175
},
{
"epoch": 2.558645408478556,
"grad_norm": 0.11327840433968207,
"learning_rate": 5.768720512571444e-06,
"loss": 0.2709,
"step": 5176
},
{
"epoch": 2.5591397849462365,
"grad_norm": 0.11272938431762897,
"learning_rate": 5.765192694285169e-06,
"loss": 0.2991,
"step": 5177
},
{
"epoch": 2.559634161413917,
"grad_norm": 0.12039152041227312,
"learning_rate": 5.761665518097323e-06,
"loss": 0.2845,
"step": 5178
},
{
"epoch": 2.5601285378815968,
"grad_norm": 0.11668936979687371,
"learning_rate": 5.758138984542731e-06,
"loss": 0.2976,
"step": 5179
},
{
"epoch": 2.560622914349277,
"grad_norm": 0.1209923713919039,
"learning_rate": 5.75461309415609e-06,
"loss": 0.2939,
"step": 5180
},
{
"epoch": 2.561117290816957,
"grad_norm": 0.11917428186208219,
"learning_rate": 5.751087847472005e-06,
"loss": 0.2873,
"step": 5181
},
{
"epoch": 2.5616116672846374,
"grad_norm": 0.12246090360999376,
"learning_rate": 5.747563245024999e-06,
"loss": 0.3063,
"step": 5182
},
{
"epoch": 2.5621060437523173,
"grad_norm": 0.12174348033286897,
"learning_rate": 5.744039287349474e-06,
"loss": 0.2942,
"step": 5183
},
{
"epoch": 2.5626004202199977,
"grad_norm": 0.12054625847142383,
"learning_rate": 5.740515974979755e-06,
"loss": 0.2965,
"step": 5184
},
{
"epoch": 2.5630947966876776,
"grad_norm": 0.11994487835206687,
"learning_rate": 5.736993308450061e-06,
"loss": 0.2797,
"step": 5185
},
{
"epoch": 2.563589173155358,
"grad_norm": 0.11377309311122236,
"learning_rate": 5.73347128829451e-06,
"loss": 0.2905,
"step": 5186
},
{
"epoch": 2.564083549623038,
"grad_norm": 0.11713581751653752,
"learning_rate": 5.72994991504712e-06,
"loss": 0.2776,
"step": 5187
},
{
"epoch": 2.564577926090718,
"grad_norm": 0.11867259656817744,
"learning_rate": 5.726429189241827e-06,
"loss": 0.279,
"step": 5188
},
{
"epoch": 2.565072302558398,
"grad_norm": 0.11898054389194826,
"learning_rate": 5.722909111412447e-06,
"loss": 0.2971,
"step": 5189
},
{
"epoch": 2.5655666790260785,
"grad_norm": 0.12366512404085714,
"learning_rate": 5.719389682092712e-06,
"loss": 0.2841,
"step": 5190
},
{
"epoch": 2.5660610554937584,
"grad_norm": 0.12315314830982287,
"learning_rate": 5.715870901816256e-06,
"loss": 0.2852,
"step": 5191
},
{
"epoch": 2.5665554319614388,
"grad_norm": 0.12196519949561417,
"learning_rate": 5.712352771116605e-06,
"loss": 0.3078,
"step": 5192
},
{
"epoch": 2.5670498084291187,
"grad_norm": 0.11933588160471381,
"learning_rate": 5.708835290527197e-06,
"loss": 0.309,
"step": 5193
},
{
"epoch": 2.567544184896799,
"grad_norm": 0.34987133101990736,
"learning_rate": 5.705318460581359e-06,
"loss": 0.2833,
"step": 5194
},
{
"epoch": 2.568038561364479,
"grad_norm": 0.12531830628978896,
"learning_rate": 5.701802281812338e-06,
"loss": 0.2989,
"step": 5195
},
{
"epoch": 2.5685329378321593,
"grad_norm": 0.13171675002605837,
"learning_rate": 5.698286754753258e-06,
"loss": 0.2895,
"step": 5196
},
{
"epoch": 2.569027314299839,
"grad_norm": 0.12321188032545648,
"learning_rate": 5.69477187993717e-06,
"loss": 0.328,
"step": 5197
},
{
"epoch": 2.5695216907675196,
"grad_norm": 0.13083058173514303,
"learning_rate": 5.691257657897003e-06,
"loss": 0.3162,
"step": 5198
},
{
"epoch": 2.5700160672351995,
"grad_norm": 0.11854249888584438,
"learning_rate": 5.687744089165604e-06,
"loss": 0.2801,
"step": 5199
},
{
"epoch": 2.57051044370288,
"grad_norm": 0.12419161142933885,
"learning_rate": 5.68423117427571e-06,
"loss": 0.2971,
"step": 5200
},
{
"epoch": 2.5710048201705598,
"grad_norm": 0.12330498768341507,
"learning_rate": 5.680718913759964e-06,
"loss": 0.2888,
"step": 5201
},
{
"epoch": 2.57149919663824,
"grad_norm": 0.12064939184539612,
"learning_rate": 5.677207308150916e-06,
"loss": 0.2826,
"step": 5202
},
{
"epoch": 2.57199357310592,
"grad_norm": 0.11406667986070125,
"learning_rate": 5.673696357981002e-06,
"loss": 0.3049,
"step": 5203
},
{
"epoch": 2.5724879495736004,
"grad_norm": 0.12114044513009727,
"learning_rate": 5.670186063782566e-06,
"loss": 0.3071,
"step": 5204
},
{
"epoch": 2.5729823260412803,
"grad_norm": 0.12371901362816314,
"learning_rate": 5.666676426087855e-06,
"loss": 0.3095,
"step": 5205
},
{
"epoch": 2.5734767025089607,
"grad_norm": 0.12228406380625531,
"learning_rate": 5.663167445429019e-06,
"loss": 0.2942,
"step": 5206
},
{
"epoch": 2.5739710789766406,
"grad_norm": 0.11947646835008831,
"learning_rate": 5.659659122338092e-06,
"loss": 0.3243,
"step": 5207
},
{
"epoch": 2.574465455444321,
"grad_norm": 0.12244071055527578,
"learning_rate": 5.656151457347034e-06,
"loss": 0.2874,
"step": 5208
},
{
"epoch": 2.574959831912001,
"grad_norm": 0.11942180088713565,
"learning_rate": 5.652644450987685e-06,
"loss": 0.2886,
"step": 5209
},
{
"epoch": 2.575454208379681,
"grad_norm": 0.12005657947679897,
"learning_rate": 5.649138103791787e-06,
"loss": 0.276,
"step": 5210
},
{
"epoch": 2.575948584847361,
"grad_norm": 0.12008212067288315,
"learning_rate": 5.6456324162909885e-06,
"loss": 0.3121,
"step": 5211
},
{
"epoch": 2.5764429613150415,
"grad_norm": 0.11626785029407781,
"learning_rate": 5.642127389016842e-06,
"loss": 0.2995,
"step": 5212
},
{
"epoch": 2.5769373377827214,
"grad_norm": 0.11510947873431826,
"learning_rate": 5.638623022500786e-06,
"loss": 0.2972,
"step": 5213
},
{
"epoch": 2.5774317142504017,
"grad_norm": 0.11446826508419329,
"learning_rate": 5.635119317274174e-06,
"loss": 0.2838,
"step": 5214
},
{
"epoch": 2.5779260907180817,
"grad_norm": 0.12063403601664943,
"learning_rate": 5.631616273868242e-06,
"loss": 0.2764,
"step": 5215
},
{
"epoch": 2.578420467185762,
"grad_norm": 0.11624700504718069,
"learning_rate": 5.628113892814142e-06,
"loss": 0.2949,
"step": 5216
},
{
"epoch": 2.5789148436534424,
"grad_norm": 0.11910378526083365,
"learning_rate": 5.624612174642922e-06,
"loss": 0.2911,
"step": 5217
},
{
"epoch": 2.5794092201211223,
"grad_norm": 0.12003567490686642,
"learning_rate": 5.621111119885521e-06,
"loss": 0.2831,
"step": 5218
},
{
"epoch": 2.579903596588802,
"grad_norm": 0.12133620846436559,
"learning_rate": 5.617610729072787e-06,
"loss": 0.2913,
"step": 5219
},
{
"epoch": 2.5803979730564826,
"grad_norm": 0.11894669968292397,
"learning_rate": 5.614111002735461e-06,
"loss": 0.2957,
"step": 5220
},
{
"epoch": 2.580892349524163,
"grad_norm": 0.12124165628830338,
"learning_rate": 5.610611941404181e-06,
"loss": 0.2985,
"step": 5221
},
{
"epoch": 2.581386725991843,
"grad_norm": 0.11810350063805412,
"learning_rate": 5.607113545609495e-06,
"loss": 0.2789,
"step": 5222
},
{
"epoch": 2.5818811024595227,
"grad_norm": 0.11640459694501548,
"learning_rate": 5.603615815881845e-06,
"loss": 0.2989,
"step": 5223
},
{
"epoch": 2.582375478927203,
"grad_norm": 0.11818723291896666,
"learning_rate": 5.600118752751562e-06,
"loss": 0.3035,
"step": 5224
},
{
"epoch": 2.5828698553948835,
"grad_norm": 0.11400700868308114,
"learning_rate": 5.5966223567488975e-06,
"loss": 0.2791,
"step": 5225
},
{
"epoch": 2.5833642318625634,
"grad_norm": 0.11288247461686995,
"learning_rate": 5.59312662840398e-06,
"loss": 0.2916,
"step": 5226
},
{
"epoch": 2.5838586083302433,
"grad_norm": 0.121645229809868,
"learning_rate": 5.589631568246841e-06,
"loss": 0.2851,
"step": 5227
},
{
"epoch": 2.5843529847979236,
"grad_norm": 0.11409409339275682,
"learning_rate": 5.586137176807429e-06,
"loss": 0.2965,
"step": 5228
},
{
"epoch": 2.584847361265604,
"grad_norm": 0.11647540651952804,
"learning_rate": 5.582643454615572e-06,
"loss": 0.281,
"step": 5229
},
{
"epoch": 2.585341737733284,
"grad_norm": 0.12875713094544353,
"learning_rate": 5.579150402200997e-06,
"loss": 0.3084,
"step": 5230
},
{
"epoch": 2.585836114200964,
"grad_norm": 0.11789579094858117,
"learning_rate": 5.575658020093342e-06,
"loss": 0.3508,
"step": 5231
},
{
"epoch": 2.586330490668644,
"grad_norm": 0.12835943199643018,
"learning_rate": 5.57216630882213e-06,
"loss": 0.3108,
"step": 5232
},
{
"epoch": 2.5868248671363245,
"grad_norm": 0.12151644358828899,
"learning_rate": 5.56867526891679e-06,
"loss": 0.2961,
"step": 5233
},
{
"epoch": 2.5873192436040044,
"grad_norm": 0.11741048267776535,
"learning_rate": 5.565184900906653e-06,
"loss": 0.2872,
"step": 5234
},
{
"epoch": 2.5878136200716844,
"grad_norm": 0.12364878044987142,
"learning_rate": 5.561695205320937e-06,
"loss": 0.3307,
"step": 5235
},
{
"epoch": 2.5883079965393647,
"grad_norm": 0.11623382446914379,
"learning_rate": 5.558206182688762e-06,
"loss": 0.2928,
"step": 5236
},
{
"epoch": 2.588802373007045,
"grad_norm": 0.11632553012047736,
"learning_rate": 5.5547178335391536e-06,
"loss": 0.2893,
"step": 5237
},
{
"epoch": 2.589296749474725,
"grad_norm": 0.11627103208245501,
"learning_rate": 5.551230158401021e-06,
"loss": 0.2728,
"step": 5238
},
{
"epoch": 2.589791125942405,
"grad_norm": 0.12291529367572367,
"learning_rate": 5.547743157803185e-06,
"loss": 0.3009,
"step": 5239
},
{
"epoch": 2.5902855024100853,
"grad_norm": 0.11808929025484002,
"learning_rate": 5.544256832274362e-06,
"loss": 0.2821,
"step": 5240
},
{
"epoch": 2.5907798788777656,
"grad_norm": 0.11541136502601232,
"learning_rate": 5.5407711823431545e-06,
"loss": 0.285,
"step": 5241
},
{
"epoch": 2.5912742553454455,
"grad_norm": 0.11737205668711684,
"learning_rate": 5.537286208538077e-06,
"loss": 0.2796,
"step": 5242
},
{
"epoch": 2.5917686318131254,
"grad_norm": 0.1192819429953958,
"learning_rate": 5.53380191138753e-06,
"loss": 0.3019,
"step": 5243
},
{
"epoch": 2.592263008280806,
"grad_norm": 0.12684663412228503,
"learning_rate": 5.530318291419821e-06,
"loss": 0.2997,
"step": 5244
},
{
"epoch": 2.592757384748486,
"grad_norm": 0.1209416395847066,
"learning_rate": 5.5268353491631525e-06,
"loss": 0.2977,
"step": 5245
},
{
"epoch": 2.593251761216166,
"grad_norm": 0.11437162221877069,
"learning_rate": 5.523353085145617e-06,
"loss": 0.2776,
"step": 5246
},
{
"epoch": 2.593746137683846,
"grad_norm": 0.11832951449913524,
"learning_rate": 5.519871499895208e-06,
"loss": 0.3088,
"step": 5247
},
{
"epoch": 2.5942405141515263,
"grad_norm": 0.12143605022323387,
"learning_rate": 5.516390593939824e-06,
"loss": 0.2794,
"step": 5248
},
{
"epoch": 2.5947348906192067,
"grad_norm": 0.11701183995717546,
"learning_rate": 5.512910367807246e-06,
"loss": 0.2931,
"step": 5249
},
{
"epoch": 2.5952292670868866,
"grad_norm": 0.12383154871238491,
"learning_rate": 5.509430822025163e-06,
"loss": 0.29,
"step": 5250
},
{
"epoch": 2.5957236435545665,
"grad_norm": 0.11903227407481952,
"learning_rate": 5.505951957121165e-06,
"loss": 0.2774,
"step": 5251
},
{
"epoch": 2.596218020022247,
"grad_norm": 0.11857953719464964,
"learning_rate": 5.502473773622723e-06,
"loss": 0.2927,
"step": 5252
},
{
"epoch": 2.5967123964899272,
"grad_norm": 0.1187410762546295,
"learning_rate": 5.498996272057213e-06,
"loss": 0.3013,
"step": 5253
},
{
"epoch": 2.597206772957607,
"grad_norm": 0.11800047048143808,
"learning_rate": 5.495519452951908e-06,
"loss": 0.2921,
"step": 5254
},
{
"epoch": 2.5977011494252875,
"grad_norm": 0.12202830832067574,
"learning_rate": 5.492043316833984e-06,
"loss": 0.2829,
"step": 5255
},
{
"epoch": 2.5981955258929674,
"grad_norm": 0.11879426420055163,
"learning_rate": 5.488567864230499e-06,
"loss": 0.3088,
"step": 5256
},
{
"epoch": 2.598689902360648,
"grad_norm": 0.12183213911476094,
"learning_rate": 5.485093095668419e-06,
"loss": 0.2851,
"step": 5257
},
{
"epoch": 2.5991842788283277,
"grad_norm": 0.11986219717340525,
"learning_rate": 5.4816190116746e-06,
"loss": 0.2992,
"step": 5258
},
{
"epoch": 2.599678655296008,
"grad_norm": 0.12317928729897132,
"learning_rate": 5.478145612775799e-06,
"loss": 0.2982,
"step": 5259
},
{
"epoch": 2.600173031763688,
"grad_norm": 0.11570463591142735,
"learning_rate": 5.474672899498663e-06,
"loss": 0.291,
"step": 5260
},
{
"epoch": 2.6006674082313683,
"grad_norm": 0.11588927412120073,
"learning_rate": 5.471200872369744e-06,
"loss": 0.3027,
"step": 5261
},
{
"epoch": 2.6011617846990482,
"grad_norm": 0.12090386797414497,
"learning_rate": 5.46772953191548e-06,
"loss": 0.2936,
"step": 5262
},
{
"epoch": 2.6016561611667286,
"grad_norm": 0.1131041748725753,
"learning_rate": 5.464258878662212e-06,
"loss": 0.2969,
"step": 5263
},
{
"epoch": 2.6021505376344085,
"grad_norm": 0.11953685724541199,
"learning_rate": 5.460788913136173e-06,
"loss": 0.3065,
"step": 5264
},
{
"epoch": 2.602644914102089,
"grad_norm": 0.12257604314697466,
"learning_rate": 5.45731963586349e-06,
"loss": 0.3071,
"step": 5265
},
{
"epoch": 2.603139290569769,
"grad_norm": 0.12215330032031896,
"learning_rate": 5.453851047370198e-06,
"loss": 0.2868,
"step": 5266
},
{
"epoch": 2.603633667037449,
"grad_norm": 0.11957747495271412,
"learning_rate": 5.45038314818221e-06,
"loss": 0.2958,
"step": 5267
},
{
"epoch": 2.604128043505129,
"grad_norm": 0.11985471563591327,
"learning_rate": 5.4469159388253475e-06,
"loss": 0.2912,
"step": 5268
},
{
"epoch": 2.6046224199728094,
"grad_norm": 0.11772839090764832,
"learning_rate": 5.443449419825321e-06,
"loss": 0.2753,
"step": 5269
},
{
"epoch": 2.6051167964404893,
"grad_norm": 0.11828146891177807,
"learning_rate": 5.439983591707734e-06,
"loss": 0.2857,
"step": 5270
},
{
"epoch": 2.6056111729081697,
"grad_norm": 0.11625412768690119,
"learning_rate": 5.436518454998092e-06,
"loss": 0.2872,
"step": 5271
},
{
"epoch": 2.6061055493758496,
"grad_norm": 0.1149711302700301,
"learning_rate": 5.433054010221798e-06,
"loss": 0.2849,
"step": 5272
},
{
"epoch": 2.60659992584353,
"grad_norm": 0.12845030547743455,
"learning_rate": 5.429590257904136e-06,
"loss": 0.3153,
"step": 5273
},
{
"epoch": 2.60709430231121,
"grad_norm": 0.12263122046430407,
"learning_rate": 5.426127198570303e-06,
"loss": 0.3255,
"step": 5274
},
{
"epoch": 2.6075886787788902,
"grad_norm": 0.11360071111083957,
"learning_rate": 5.422664832745379e-06,
"loss": 0.2881,
"step": 5275
},
{
"epoch": 2.60808305524657,
"grad_norm": 0.11188609204404804,
"learning_rate": 5.41920316095433e-06,
"loss": 0.2903,
"step": 5276
},
{
"epoch": 2.6085774317142505,
"grad_norm": 0.11859456675871563,
"learning_rate": 5.415742183722048e-06,
"loss": 0.2842,
"step": 5277
},
{
"epoch": 2.6090718081819304,
"grad_norm": 0.11812360687704618,
"learning_rate": 5.4122819015732915e-06,
"loss": 0.2781,
"step": 5278
},
{
"epoch": 2.6095661846496108,
"grad_norm": 0.11859585268239224,
"learning_rate": 5.408822315032718e-06,
"loss": 0.3019,
"step": 5279
},
{
"epoch": 2.6100605611172907,
"grad_norm": 0.11791322242747979,
"learning_rate": 5.405363424624891e-06,
"loss": 0.2947,
"step": 5280
},
{
"epoch": 2.610554937584971,
"grad_norm": 0.12641958797683678,
"learning_rate": 5.4019052308742545e-06,
"loss": 0.2664,
"step": 5281
},
{
"epoch": 2.611049314052651,
"grad_norm": 0.1201228190295021,
"learning_rate": 5.398447734305157e-06,
"loss": 0.2929,
"step": 5282
},
{
"epoch": 2.6115436905203313,
"grad_norm": 0.12320799508262395,
"learning_rate": 5.394990935441843e-06,
"loss": 0.294,
"step": 5283
},
{
"epoch": 2.612038066988011,
"grad_norm": 0.1222309218364277,
"learning_rate": 5.39153483480844e-06,
"loss": 0.3003,
"step": 5284
},
{
"epoch": 2.6125324434556916,
"grad_norm": 0.12362015520489734,
"learning_rate": 5.388079432928974e-06,
"loss": 0.2925,
"step": 5285
},
{
"epoch": 2.6130268199233715,
"grad_norm": 0.117990176846485,
"learning_rate": 5.384624730327375e-06,
"loss": 0.2898,
"step": 5286
},
{
"epoch": 2.613521196391052,
"grad_norm": 0.1156701558222564,
"learning_rate": 5.38117072752745e-06,
"loss": 0.27,
"step": 5287
},
{
"epoch": 2.6140155728587318,
"grad_norm": 0.11975300297780496,
"learning_rate": 5.377717425052912e-06,
"loss": 0.3066,
"step": 5288
},
{
"epoch": 2.614509949326412,
"grad_norm": 0.13267225547164677,
"learning_rate": 5.374264823427368e-06,
"loss": 0.3025,
"step": 5289
},
{
"epoch": 2.615004325794092,
"grad_norm": 0.11860123095461468,
"learning_rate": 5.370812923174311e-06,
"loss": 0.2928,
"step": 5290
},
{
"epoch": 2.6154987022617724,
"grad_norm": 0.11931850277877938,
"learning_rate": 5.367361724817136e-06,
"loss": 0.2827,
"step": 5291
},
{
"epoch": 2.6159930787294527,
"grad_norm": 0.12481529253615642,
"learning_rate": 5.363911228879125e-06,
"loss": 0.3066,
"step": 5292
},
{
"epoch": 2.6164874551971327,
"grad_norm": 0.1267396199374545,
"learning_rate": 5.360461435883448e-06,
"loss": 0.303,
"step": 5293
},
{
"epoch": 2.6169818316648126,
"grad_norm": 0.11820432958015449,
"learning_rate": 5.3570123463531935e-06,
"loss": 0.2816,
"step": 5294
},
{
"epoch": 2.617476208132493,
"grad_norm": 0.12409476033445171,
"learning_rate": 5.3535639608113165e-06,
"loss": 0.3052,
"step": 5295
},
{
"epoch": 2.6179705846001733,
"grad_norm": 0.120064640412,
"learning_rate": 5.3501162797806706e-06,
"loss": 0.2851,
"step": 5296
},
{
"epoch": 2.618464961067853,
"grad_norm": 0.12654161451417154,
"learning_rate": 5.346669303784018e-06,
"loss": 0.2962,
"step": 5297
},
{
"epoch": 2.618959337535533,
"grad_norm": 0.12244864746154716,
"learning_rate": 5.343223033343992e-06,
"loss": 0.325,
"step": 5298
},
{
"epoch": 2.6194537140032135,
"grad_norm": 0.12801457294303026,
"learning_rate": 5.339777468983135e-06,
"loss": 0.2955,
"step": 5299
},
{
"epoch": 2.619948090470894,
"grad_norm": 0.1254378208105475,
"learning_rate": 5.3363326112238825e-06,
"loss": 0.2909,
"step": 5300
},
{
"epoch": 2.6204424669385737,
"grad_norm": 0.12203033962378684,
"learning_rate": 5.33288846058855e-06,
"loss": 0.292,
"step": 5301
},
{
"epoch": 2.6209368434062537,
"grad_norm": 0.11842813389469159,
"learning_rate": 5.329445017599354e-06,
"loss": 0.3029,
"step": 5302
},
{
"epoch": 2.621431219873934,
"grad_norm": 0.12184081123115097,
"learning_rate": 5.326002282778409e-06,
"loss": 0.3087,
"step": 5303
},
{
"epoch": 2.6219255963416144,
"grad_norm": 0.12118019257280965,
"learning_rate": 5.322560256647706e-06,
"loss": 0.2789,
"step": 5304
},
{
"epoch": 2.6224199728092943,
"grad_norm": 0.12521863293020863,
"learning_rate": 5.319118939729146e-06,
"loss": 0.2997,
"step": 5305
},
{
"epoch": 2.622914349276974,
"grad_norm": 0.11979586303671466,
"learning_rate": 5.31567833254452e-06,
"loss": 0.2848,
"step": 5306
},
{
"epoch": 2.6234087257446546,
"grad_norm": 0.11781060045295265,
"learning_rate": 5.312238435615495e-06,
"loss": 0.3078,
"step": 5307
},
{
"epoch": 2.623903102212335,
"grad_norm": 0.1257480569156441,
"learning_rate": 5.308799249463652e-06,
"loss": 0.3066,
"step": 5308
},
{
"epoch": 2.624397478680015,
"grad_norm": 0.11467512929884713,
"learning_rate": 5.305360774610446e-06,
"loss": 0.2933,
"step": 5309
},
{
"epoch": 2.6248918551476947,
"grad_norm": 0.12161110833350373,
"learning_rate": 5.301923011577242e-06,
"loss": 0.2814,
"step": 5310
},
{
"epoch": 2.625386231615375,
"grad_norm": 0.11926134314269216,
"learning_rate": 5.298485960885276e-06,
"loss": 0.2742,
"step": 5311
},
{
"epoch": 2.6258806080830555,
"grad_norm": 0.11983715226571397,
"learning_rate": 5.295049623055697e-06,
"loss": 0.3099,
"step": 5312
},
{
"epoch": 2.6263749845507354,
"grad_norm": 0.12099152552619129,
"learning_rate": 5.291613998609528e-06,
"loss": 0.3067,
"step": 5313
},
{
"epoch": 2.6268693610184153,
"grad_norm": 0.11838952012931128,
"learning_rate": 5.288179088067697e-06,
"loss": 0.296,
"step": 5314
},
{
"epoch": 2.6273637374860956,
"grad_norm": 0.12323796168739017,
"learning_rate": 5.284744891951024e-06,
"loss": 0.3169,
"step": 5315
},
{
"epoch": 2.627858113953776,
"grad_norm": 0.12047081407323054,
"learning_rate": 5.281311410780203e-06,
"loss": 0.2839,
"step": 5316
},
{
"epoch": 2.628352490421456,
"grad_norm": 0.12536740134207602,
"learning_rate": 5.277878645075845e-06,
"loss": 0.2891,
"step": 5317
},
{
"epoch": 2.628846866889136,
"grad_norm": 0.11742359585414734,
"learning_rate": 5.274446595358434e-06,
"loss": 0.298,
"step": 5318
},
{
"epoch": 2.629341243356816,
"grad_norm": 0.1174798023795677,
"learning_rate": 5.2710152621483465e-06,
"loss": 0.2889,
"step": 5319
},
{
"epoch": 2.6298356198244965,
"grad_norm": 0.12633013781806685,
"learning_rate": 5.26758464596586e-06,
"loss": 0.3008,
"step": 5320
},
{
"epoch": 2.6303299962921765,
"grad_norm": 0.11910631839996609,
"learning_rate": 5.2641547473311405e-06,
"loss": 0.3006,
"step": 5321
},
{
"epoch": 2.6308243727598564,
"grad_norm": 0.12164405949715995,
"learning_rate": 5.260725566764237e-06,
"loss": 0.3,
"step": 5322
},
{
"epoch": 2.6313187492275367,
"grad_norm": 0.11904781152484892,
"learning_rate": 5.257297104785103e-06,
"loss": 0.281,
"step": 5323
},
{
"epoch": 2.631813125695217,
"grad_norm": 0.11450415613142828,
"learning_rate": 5.253869361913571e-06,
"loss": 0.2951,
"step": 5324
},
{
"epoch": 2.632307502162897,
"grad_norm": 0.12604630837820327,
"learning_rate": 5.250442338669362e-06,
"loss": 0.2943,
"step": 5325
},
{
"epoch": 2.632801878630577,
"grad_norm": 0.11918985254852647,
"learning_rate": 5.247016035572109e-06,
"loss": 0.3311,
"step": 5326
},
{
"epoch": 2.6332962550982573,
"grad_norm": 0.12227035337242163,
"learning_rate": 5.2435904531413165e-06,
"loss": 0.2923,
"step": 5327
},
{
"epoch": 2.6337906315659376,
"grad_norm": 0.12389181551478365,
"learning_rate": 5.240165591896378e-06,
"loss": 0.321,
"step": 5328
},
{
"epoch": 2.6342850080336175,
"grad_norm": 0.1167273485376711,
"learning_rate": 5.236741452356596e-06,
"loss": 0.2998,
"step": 5329
},
{
"epoch": 2.634779384501298,
"grad_norm": 0.12026443469760356,
"learning_rate": 5.233318035041143e-06,
"loss": 0.2924,
"step": 5330
},
{
"epoch": 2.635273760968978,
"grad_norm": 0.1255245424119962,
"learning_rate": 5.229895340469093e-06,
"loss": 0.3136,
"step": 5331
},
{
"epoch": 2.635768137436658,
"grad_norm": 0.1182941099906537,
"learning_rate": 5.226473369159417e-06,
"loss": 0.298,
"step": 5332
},
{
"epoch": 2.636262513904338,
"grad_norm": 0.1218123196276219,
"learning_rate": 5.223052121630956e-06,
"loss": 0.2954,
"step": 5333
},
{
"epoch": 2.6367568903720184,
"grad_norm": 0.12126451761189722,
"learning_rate": 5.219631598402464e-06,
"loss": 0.3035,
"step": 5334
},
{
"epoch": 2.6372512668396983,
"grad_norm": 0.12188235340612692,
"learning_rate": 5.216211799992568e-06,
"loss": 0.2804,
"step": 5335
},
{
"epoch": 2.6377456433073787,
"grad_norm": 0.11677866583357024,
"learning_rate": 5.21279272691979e-06,
"loss": 0.2812,
"step": 5336
},
{
"epoch": 2.6382400197750586,
"grad_norm": 0.11396051813304983,
"learning_rate": 5.209374379702545e-06,
"loss": 0.2957,
"step": 5337
},
{
"epoch": 2.638734396242739,
"grad_norm": 0.12131803149392076,
"learning_rate": 5.205956758859143e-06,
"loss": 0.3041,
"step": 5338
},
{
"epoch": 2.639228772710419,
"grad_norm": 0.11480627496371006,
"learning_rate": 5.202539864907767e-06,
"loss": 0.2791,
"step": 5339
},
{
"epoch": 2.6397231491780992,
"grad_norm": 0.12239338950217526,
"learning_rate": 5.19912369836651e-06,
"loss": 0.2878,
"step": 5340
},
{
"epoch": 2.640217525645779,
"grad_norm": 0.11812118955597549,
"learning_rate": 5.195708259753341e-06,
"loss": 0.2849,
"step": 5341
},
{
"epoch": 2.6407119021134595,
"grad_norm": 0.11819014727513683,
"learning_rate": 5.1922935495861125e-06,
"loss": 0.2963,
"step": 5342
},
{
"epoch": 2.6412062785811394,
"grad_norm": 0.11959389618480203,
"learning_rate": 5.188879568382595e-06,
"loss": 0.2906,
"step": 5343
},
{
"epoch": 2.64170065504882,
"grad_norm": 0.12520114271419622,
"learning_rate": 5.185466316660419e-06,
"loss": 0.2862,
"step": 5344
},
{
"epoch": 2.6421950315164997,
"grad_norm": 0.1197891964575316,
"learning_rate": 5.182053794937114e-06,
"loss": 0.301,
"step": 5345
},
{
"epoch": 2.64268940798418,
"grad_norm": 0.11995786458212282,
"learning_rate": 5.178642003730107e-06,
"loss": 0.2724,
"step": 5346
},
{
"epoch": 2.64318378445186,
"grad_norm": 0.12044202873475059,
"learning_rate": 5.1752309435567e-06,
"loss": 0.2869,
"step": 5347
},
{
"epoch": 2.6436781609195403,
"grad_norm": 0.11848441575716738,
"learning_rate": 5.171820614934094e-06,
"loss": 0.2927,
"step": 5348
},
{
"epoch": 2.6441725373872202,
"grad_norm": 0.12923720751927897,
"learning_rate": 5.168411018379384e-06,
"loss": 0.3331,
"step": 5349
},
{
"epoch": 2.6446669138549006,
"grad_norm": 0.13148734437360807,
"learning_rate": 5.165002154409538e-06,
"loss": 0.3021,
"step": 5350
},
{
"epoch": 2.6451612903225805,
"grad_norm": 0.11609291983955608,
"learning_rate": 5.161594023541423e-06,
"loss": 0.2969,
"step": 5351
},
{
"epoch": 2.645655666790261,
"grad_norm": 0.11972915354564978,
"learning_rate": 5.1581866262917965e-06,
"loss": 0.2856,
"step": 5352
},
{
"epoch": 2.646150043257941,
"grad_norm": 0.12060691511060487,
"learning_rate": 5.154779963177299e-06,
"loss": 0.3054,
"step": 5353
},
{
"epoch": 2.646644419725621,
"grad_norm": 0.11616269841866343,
"learning_rate": 5.15137403471446e-06,
"loss": 0.2801,
"step": 5354
},
{
"epoch": 2.647138796193301,
"grad_norm": 0.12477719571952374,
"learning_rate": 5.1479688414197095e-06,
"loss": 0.2992,
"step": 5355
},
{
"epoch": 2.6476331726609814,
"grad_norm": 0.11809261504781512,
"learning_rate": 5.144564383809345e-06,
"loss": 0.3261,
"step": 5356
},
{
"epoch": 2.6481275491286613,
"grad_norm": 0.12033200426559829,
"learning_rate": 5.141160662399575e-06,
"loss": 0.2873,
"step": 5357
},
{
"epoch": 2.6486219255963417,
"grad_norm": 0.12145647491973025,
"learning_rate": 5.1377576777064745e-06,
"loss": 0.2932,
"step": 5358
},
{
"epoch": 2.6491163020640216,
"grad_norm": 0.11683998368861097,
"learning_rate": 5.134355430246027e-06,
"loss": 0.2921,
"step": 5359
},
{
"epoch": 2.649610678531702,
"grad_norm": 0.118028448040275,
"learning_rate": 5.1309539205340875e-06,
"loss": 0.2908,
"step": 5360
},
{
"epoch": 2.650105054999382,
"grad_norm": 0.12215643733539432,
"learning_rate": 5.1275531490864135e-06,
"loss": 0.2902,
"step": 5361
},
{
"epoch": 2.6505994314670622,
"grad_norm": 0.11844984188826191,
"learning_rate": 5.124153116418636e-06,
"loss": 0.301,
"step": 5362
},
{
"epoch": 2.651093807934742,
"grad_norm": 0.11726039343653083,
"learning_rate": 5.1207538230462896e-06,
"loss": 0.2802,
"step": 5363
},
{
"epoch": 2.6515881844024225,
"grad_norm": 0.1214403926983085,
"learning_rate": 5.1173552694847804e-06,
"loss": 0.2904,
"step": 5364
},
{
"epoch": 2.6520825608701024,
"grad_norm": 0.13847644781186264,
"learning_rate": 5.113957456249414e-06,
"loss": 0.2998,
"step": 5365
},
{
"epoch": 2.6525769373377828,
"grad_norm": 0.11901879773376746,
"learning_rate": 5.110560383855387e-06,
"loss": 0.2966,
"step": 5366
},
{
"epoch": 2.653071313805463,
"grad_norm": 0.11651029395966365,
"learning_rate": 5.10716405281777e-06,
"loss": 0.2718,
"step": 5367
},
{
"epoch": 2.653565690273143,
"grad_norm": 0.12037734379383914,
"learning_rate": 5.103768463651528e-06,
"loss": 0.2844,
"step": 5368
},
{
"epoch": 2.654060066740823,
"grad_norm": 0.11919782486986759,
"learning_rate": 5.100373616871514e-06,
"loss": 0.2775,
"step": 5369
},
{
"epoch": 2.6545544432085033,
"grad_norm": 0.11802010654199709,
"learning_rate": 5.096979512992475e-06,
"loss": 0.308,
"step": 5370
},
{
"epoch": 2.6550488196761837,
"grad_norm": 0.12277678492018362,
"learning_rate": 5.093586152529028e-06,
"loss": 0.3156,
"step": 5371
},
{
"epoch": 2.6555431961438636,
"grad_norm": 0.11894620511769491,
"learning_rate": 5.090193535995698e-06,
"loss": 0.2961,
"step": 5372
},
{
"epoch": 2.6560375726115435,
"grad_norm": 0.11544858344947426,
"learning_rate": 5.0868016639068825e-06,
"loss": 0.2783,
"step": 5373
},
{
"epoch": 2.656531949079224,
"grad_norm": 0.11290375927802086,
"learning_rate": 5.083410536776867e-06,
"loss": 0.2942,
"step": 5374
},
{
"epoch": 2.657026325546904,
"grad_norm": 0.1223849929688257,
"learning_rate": 5.0800201551198315e-06,
"loss": 0.2978,
"step": 5375
},
{
"epoch": 2.657520702014584,
"grad_norm": 0.11723036248363425,
"learning_rate": 5.076630519449843e-06,
"loss": 0.2856,
"step": 5376
},
{
"epoch": 2.658015078482264,
"grad_norm": 0.11695186314412198,
"learning_rate": 5.073241630280845e-06,
"loss": 0.296,
"step": 5377
},
{
"epoch": 2.6585094549499444,
"grad_norm": 0.11510858725905088,
"learning_rate": 5.06985348812668e-06,
"loss": 0.2788,
"step": 5378
},
{
"epoch": 2.6590038314176248,
"grad_norm": 0.11705813924046564,
"learning_rate": 5.066466093501066e-06,
"loss": 0.2879,
"step": 5379
},
{
"epoch": 2.6594982078853047,
"grad_norm": 0.11691175677360813,
"learning_rate": 5.063079446917616e-06,
"loss": 0.3038,
"step": 5380
},
{
"epoch": 2.6599925843529846,
"grad_norm": 0.11451798695438295,
"learning_rate": 5.059693548889832e-06,
"loss": 0.2689,
"step": 5381
},
{
"epoch": 2.660486960820665,
"grad_norm": 0.11789201852726396,
"learning_rate": 5.056308399931087e-06,
"loss": 0.2998,
"step": 5382
},
{
"epoch": 2.6609813372883453,
"grad_norm": 0.11801571041267268,
"learning_rate": 5.052924000554662e-06,
"loss": 0.2844,
"step": 5383
},
{
"epoch": 2.661475713756025,
"grad_norm": 0.11846040282388662,
"learning_rate": 5.049540351273708e-06,
"loss": 0.3045,
"step": 5384
},
{
"epoch": 2.661970090223705,
"grad_norm": 0.12068569329013065,
"learning_rate": 5.0461574526012616e-06,
"loss": 0.3088,
"step": 5385
},
{
"epoch": 2.6624644666913855,
"grad_norm": 0.11923001746427282,
"learning_rate": 5.042775305050258e-06,
"loss": 0.2874,
"step": 5386
},
{
"epoch": 2.662958843159066,
"grad_norm": 0.11583831545764933,
"learning_rate": 5.039393909133515e-06,
"loss": 0.2782,
"step": 5387
},
{
"epoch": 2.6634532196267457,
"grad_norm": 0.12399828735888864,
"learning_rate": 5.036013265363724e-06,
"loss": 0.2952,
"step": 5388
},
{
"epoch": 2.6639475960944257,
"grad_norm": 0.12312042478424284,
"learning_rate": 5.0326333742534814e-06,
"loss": 0.2994,
"step": 5389
},
{
"epoch": 2.664441972562106,
"grad_norm": 0.1197897436775116,
"learning_rate": 5.029254236315257e-06,
"loss": 0.2899,
"step": 5390
},
{
"epoch": 2.6649363490297864,
"grad_norm": 0.1219876328252398,
"learning_rate": 5.025875852061399e-06,
"loss": 0.2836,
"step": 5391
},
{
"epoch": 2.6654307254974663,
"grad_norm": 0.11267832851605718,
"learning_rate": 5.0224982220041686e-06,
"loss": 0.3033,
"step": 5392
},
{
"epoch": 2.665925101965146,
"grad_norm": 0.1199870034427511,
"learning_rate": 5.019121346655687e-06,
"loss": 0.3084,
"step": 5393
},
{
"epoch": 2.6664194784328266,
"grad_norm": 0.12732007178152724,
"learning_rate": 5.015745226527966e-06,
"loss": 0.2807,
"step": 5394
},
{
"epoch": 2.666913854900507,
"grad_norm": 0.15437868475266958,
"learning_rate": 5.0123698621329145e-06,
"loss": 0.29,
"step": 5395
},
{
"epoch": 2.667408231368187,
"grad_norm": 0.11839066797018534,
"learning_rate": 5.0089952539823095e-06,
"loss": 0.2822,
"step": 5396
},
{
"epoch": 2.6679026078358667,
"grad_norm": 0.12347874280911467,
"learning_rate": 5.005621402587829e-06,
"loss": 0.2778,
"step": 5397
},
{
"epoch": 2.668396984303547,
"grad_norm": 0.11852121070860304,
"learning_rate": 5.002248308461032e-06,
"loss": 0.267,
"step": 5398
},
{
"epoch": 2.6688913607712275,
"grad_norm": 0.11650564452854686,
"learning_rate": 4.998875972113356e-06,
"loss": 0.2907,
"step": 5399
},
{
"epoch": 2.6693857372389074,
"grad_norm": 0.12843546441219036,
"learning_rate": 4.9955043940561264e-06,
"loss": 0.3068,
"step": 5400
},
{
"epoch": 2.6698801137065877,
"grad_norm": 0.11946189837409016,
"learning_rate": 4.992133574800563e-06,
"loss": 0.2974,
"step": 5401
},
{
"epoch": 2.6703744901742676,
"grad_norm": 0.12191619774994343,
"learning_rate": 4.988763514857753e-06,
"loss": 0.2971,
"step": 5402
},
{
"epoch": 2.670868866641948,
"grad_norm": 0.1166839703861477,
"learning_rate": 4.985394214738683e-06,
"loss": 0.2751,
"step": 5403
},
{
"epoch": 2.671363243109628,
"grad_norm": 0.1206208989150297,
"learning_rate": 4.9820256749542255e-06,
"loss": 0.2825,
"step": 5404
},
{
"epoch": 2.6718576195773083,
"grad_norm": 0.12252177521717676,
"learning_rate": 4.978657896015121e-06,
"loss": 0.2905,
"step": 5405
},
{
"epoch": 2.672351996044988,
"grad_norm": 0.12117987345326577,
"learning_rate": 4.975290878432016e-06,
"loss": 0.2961,
"step": 5406
},
{
"epoch": 2.6728463725126685,
"grad_norm": 0.12382084701120458,
"learning_rate": 4.971924622715423e-06,
"loss": 0.2818,
"step": 5407
},
{
"epoch": 2.6733407489803485,
"grad_norm": 0.12584694180281478,
"learning_rate": 4.968559129375751e-06,
"loss": 0.2871,
"step": 5408
},
{
"epoch": 2.673835125448029,
"grad_norm": 0.11593227337261211,
"learning_rate": 4.965194398923293e-06,
"loss": 0.2844,
"step": 5409
},
{
"epoch": 2.6743295019157087,
"grad_norm": 0.11972144545205411,
"learning_rate": 4.9618304318682185e-06,
"loss": 0.3007,
"step": 5410
},
{
"epoch": 2.674823878383389,
"grad_norm": 0.12075921548669978,
"learning_rate": 4.958467228720583e-06,
"loss": 0.2854,
"step": 5411
},
{
"epoch": 2.675318254851069,
"grad_norm": 0.11570727652279263,
"learning_rate": 4.955104789990336e-06,
"loss": 0.2903,
"step": 5412
},
{
"epoch": 2.6758126313187494,
"grad_norm": 0.13044156079815844,
"learning_rate": 4.9517431161872964e-06,
"loss": 0.3037,
"step": 5413
},
{
"epoch": 2.6763070077864293,
"grad_norm": 0.12169138775568004,
"learning_rate": 4.9483822078211775e-06,
"loss": 0.2947,
"step": 5414
},
{
"epoch": 2.6768013842541096,
"grad_norm": 0.12302919408415071,
"learning_rate": 4.945022065401579e-06,
"loss": 0.3132,
"step": 5415
},
{
"epoch": 2.6772957607217895,
"grad_norm": 0.12416929041031208,
"learning_rate": 4.941662689437975e-06,
"loss": 0.2818,
"step": 5416
},
{
"epoch": 2.67779013718947,
"grad_norm": 0.12078389603902662,
"learning_rate": 4.938304080439722e-06,
"loss": 0.2964,
"step": 5417
},
{
"epoch": 2.67828451365715,
"grad_norm": 0.11776824761404174,
"learning_rate": 4.934946238916071e-06,
"loss": 0.3031,
"step": 5418
},
{
"epoch": 2.67877889012483,
"grad_norm": 0.1234238476302436,
"learning_rate": 4.931589165376157e-06,
"loss": 0.3053,
"step": 5419
},
{
"epoch": 2.67927326659251,
"grad_norm": 0.11781958607685705,
"learning_rate": 4.928232860328983e-06,
"loss": 0.2874,
"step": 5420
},
{
"epoch": 2.6797676430601904,
"grad_norm": 0.1178436220028783,
"learning_rate": 4.924877324283452e-06,
"loss": 0.2805,
"step": 5421
},
{
"epoch": 2.6802620195278704,
"grad_norm": 0.1240278048301458,
"learning_rate": 4.92152255774834e-06,
"loss": 0.3053,
"step": 5422
},
{
"epoch": 2.6807563959955507,
"grad_norm": 0.1242784012714598,
"learning_rate": 4.918168561232313e-06,
"loss": 0.3045,
"step": 5423
},
{
"epoch": 2.6812507724632306,
"grad_norm": 0.12078651130571633,
"learning_rate": 4.9148153352439135e-06,
"loss": 0.3129,
"step": 5424
},
{
"epoch": 2.681745148930911,
"grad_norm": 0.11733200644933925,
"learning_rate": 4.911462880291576e-06,
"loss": 0.2672,
"step": 5425
},
{
"epoch": 2.682239525398591,
"grad_norm": 0.11810775346700876,
"learning_rate": 4.908111196883608e-06,
"loss": 0.291,
"step": 5426
},
{
"epoch": 2.6827339018662713,
"grad_norm": 0.116315463264095,
"learning_rate": 4.904760285528211e-06,
"loss": 0.2787,
"step": 5427
},
{
"epoch": 2.683228278333951,
"grad_norm": 0.11834054679252035,
"learning_rate": 4.901410146733459e-06,
"loss": 0.3378,
"step": 5428
},
{
"epoch": 2.6837226548016315,
"grad_norm": 0.1196216645939279,
"learning_rate": 4.898060781007312e-06,
"loss": 0.2734,
"step": 5429
},
{
"epoch": 2.6842170312693114,
"grad_norm": 0.11944469482913422,
"learning_rate": 4.894712188857622e-06,
"loss": 0.2966,
"step": 5430
},
{
"epoch": 2.684711407736992,
"grad_norm": 0.11524171526734611,
"learning_rate": 4.8913643707921075e-06,
"loss": 0.2847,
"step": 5431
},
{
"epoch": 2.6852057842046717,
"grad_norm": 0.11735378467046226,
"learning_rate": 4.888017327318385e-06,
"loss": 0.278,
"step": 5432
},
{
"epoch": 2.685700160672352,
"grad_norm": 0.126258493458387,
"learning_rate": 4.8846710589439435e-06,
"loss": 0.2868,
"step": 5433
},
{
"epoch": 2.686194537140032,
"grad_norm": 0.11661564567004983,
"learning_rate": 4.881325566176154e-06,
"loss": 0.3139,
"step": 5434
},
{
"epoch": 2.6866889136077123,
"grad_norm": 0.12176650294825758,
"learning_rate": 4.8779808495222755e-06,
"loss": 0.2764,
"step": 5435
},
{
"epoch": 2.6871832900753923,
"grad_norm": 0.12250427607319674,
"learning_rate": 4.8746369094894544e-06,
"loss": 0.3041,
"step": 5436
},
{
"epoch": 2.6876776665430726,
"grad_norm": 0.12043172371724434,
"learning_rate": 4.871293746584701e-06,
"loss": 0.2812,
"step": 5437
},
{
"epoch": 2.688172043010753,
"grad_norm": 0.12299532010414013,
"learning_rate": 4.86795136131493e-06,
"loss": 0.2822,
"step": 5438
},
{
"epoch": 2.688666419478433,
"grad_norm": 0.13150756030348568,
"learning_rate": 4.864609754186921e-06,
"loss": 0.2754,
"step": 5439
},
{
"epoch": 2.689160795946113,
"grad_norm": 0.11732056714386403,
"learning_rate": 4.861268925707335e-06,
"loss": 0.2754,
"step": 5440
},
{
"epoch": 2.689655172413793,
"grad_norm": 0.12089553338028602,
"learning_rate": 4.8579288763827384e-06,
"loss": 0.2942,
"step": 5441
},
{
"epoch": 2.6901495488814735,
"grad_norm": 0.12208429553181692,
"learning_rate": 4.854589606719553e-06,
"loss": 0.3039,
"step": 5442
},
{
"epoch": 2.6906439253491534,
"grad_norm": 0.12509280418063878,
"learning_rate": 4.851251117224089e-06,
"loss": 0.2898,
"step": 5443
},
{
"epoch": 2.6911383018168333,
"grad_norm": 0.12356288283078327,
"learning_rate": 4.84791340840255e-06,
"loss": 0.3352,
"step": 5444
},
{
"epoch": 2.6916326782845137,
"grad_norm": 0.15513727891604143,
"learning_rate": 4.844576480761005e-06,
"loss": 0.2814,
"step": 5445
},
{
"epoch": 2.692127054752194,
"grad_norm": 0.11906105363573367,
"learning_rate": 4.841240334805416e-06,
"loss": 0.2731,
"step": 5446
},
{
"epoch": 2.692621431219874,
"grad_norm": 0.11824889696358584,
"learning_rate": 4.837904971041626e-06,
"loss": 0.2904,
"step": 5447
},
{
"epoch": 2.693115807687554,
"grad_norm": 0.12218488895497576,
"learning_rate": 4.834570389975354e-06,
"loss": 0.2945,
"step": 5448
},
{
"epoch": 2.6936101841552342,
"grad_norm": 0.12518790425861143,
"learning_rate": 4.8312365921121965e-06,
"loss": 0.2989,
"step": 5449
},
{
"epoch": 2.6941045606229146,
"grad_norm": 0.12025149297300314,
"learning_rate": 4.827903577957646e-06,
"loss": 0.3015,
"step": 5450
},
{
"epoch": 2.6945989370905945,
"grad_norm": 0.12161453676668013,
"learning_rate": 4.8245713480170594e-06,
"loss": 0.3191,
"step": 5451
},
{
"epoch": 2.6950933135582744,
"grad_norm": 0.12373938307496132,
"learning_rate": 4.821239902795689e-06,
"loss": 0.3068,
"step": 5452
},
{
"epoch": 2.6955876900259548,
"grad_norm": 0.12960472364261474,
"learning_rate": 4.817909242798662e-06,
"loss": 0.2911,
"step": 5453
},
{
"epoch": 2.696082066493635,
"grad_norm": 0.12564599506009963,
"learning_rate": 4.8145793685309805e-06,
"loss": 0.2938,
"step": 5454
},
{
"epoch": 2.696576442961315,
"grad_norm": 0.1251254148866729,
"learning_rate": 4.811250280497541e-06,
"loss": 0.2857,
"step": 5455
},
{
"epoch": 2.697070819428995,
"grad_norm": 0.11825792932913422,
"learning_rate": 4.80792197920311e-06,
"loss": 0.2918,
"step": 5456
},
{
"epoch": 2.6975651958966753,
"grad_norm": 0.11699398792372175,
"learning_rate": 4.804594465152329e-06,
"loss": 0.2855,
"step": 5457
},
{
"epoch": 2.6980595723643557,
"grad_norm": 0.12781879237949395,
"learning_rate": 4.801267738849745e-06,
"loss": 0.2989,
"step": 5458
},
{
"epoch": 2.6985539488320356,
"grad_norm": 0.12514462360020984,
"learning_rate": 4.797941800799763e-06,
"loss": 0.2873,
"step": 5459
},
{
"epoch": 2.6990483252997155,
"grad_norm": 0.1253001419221765,
"learning_rate": 4.794616651506667e-06,
"loss": 0.3016,
"step": 5460
},
{
"epoch": 2.699542701767396,
"grad_norm": 0.11828579188084494,
"learning_rate": 4.791292291474643e-06,
"loss": 0.2845,
"step": 5461
},
{
"epoch": 2.700037078235076,
"grad_norm": 0.12372422953361027,
"learning_rate": 4.787968721207731e-06,
"loss": 0.3132,
"step": 5462
},
{
"epoch": 2.700531454702756,
"grad_norm": 0.12731089813797639,
"learning_rate": 4.7846459412098715e-06,
"loss": 0.2815,
"step": 5463
},
{
"epoch": 2.701025831170436,
"grad_norm": 0.122237045451103,
"learning_rate": 4.7813239519848795e-06,
"loss": 0.2992,
"step": 5464
},
{
"epoch": 2.7015202076381164,
"grad_norm": 0.11601498146430497,
"learning_rate": 4.778002754036445e-06,
"loss": 0.3087,
"step": 5465
},
{
"epoch": 2.7020145841057968,
"grad_norm": 0.1158264559731617,
"learning_rate": 4.774682347868137e-06,
"loss": 0.3177,
"step": 5466
},
{
"epoch": 2.7025089605734767,
"grad_norm": 0.12305366951007594,
"learning_rate": 4.7713627339834146e-06,
"loss": 0.299,
"step": 5467
},
{
"epoch": 2.7030033370411566,
"grad_norm": 0.12227587519086434,
"learning_rate": 4.768043912885612e-06,
"loss": 0.2913,
"step": 5468
},
{
"epoch": 2.703497713508837,
"grad_norm": 0.11880177700395178,
"learning_rate": 4.7647258850779364e-06,
"loss": 0.2976,
"step": 5469
},
{
"epoch": 2.7039920899765173,
"grad_norm": 0.1289142662073662,
"learning_rate": 4.761408651063487e-06,
"loss": 0.3058,
"step": 5470
},
{
"epoch": 2.704486466444197,
"grad_norm": 0.12178383828582083,
"learning_rate": 4.75809221134523e-06,
"loss": 0.2931,
"step": 5471
},
{
"epoch": 2.704980842911877,
"grad_norm": 0.11380077490045665,
"learning_rate": 4.7547765664260225e-06,
"loss": 0.304,
"step": 5472
},
{
"epoch": 2.7054752193795575,
"grad_norm": 0.1292711050847456,
"learning_rate": 4.751461716808591e-06,
"loss": 0.2994,
"step": 5473
},
{
"epoch": 2.705969595847238,
"grad_norm": 0.11884773489415933,
"learning_rate": 4.7481476629955515e-06,
"loss": 0.2954,
"step": 5474
},
{
"epoch": 2.7064639723149178,
"grad_norm": 0.12113154191560363,
"learning_rate": 4.744834405489388e-06,
"loss": 0.2964,
"step": 5475
},
{
"epoch": 2.706958348782598,
"grad_norm": 0.12289636407444,
"learning_rate": 4.7415219447924775e-06,
"loss": 0.2994,
"step": 5476
},
{
"epoch": 2.707452725250278,
"grad_norm": 0.11847431226543416,
"learning_rate": 4.73821028140706e-06,
"loss": 0.3122,
"step": 5477
},
{
"epoch": 2.7079471017179584,
"grad_norm": 0.11793520470898605,
"learning_rate": 4.734899415835267e-06,
"loss": 0.2811,
"step": 5478
},
{
"epoch": 2.7084414781856383,
"grad_norm": 0.11873148889863068,
"learning_rate": 4.73158934857911e-06,
"loss": 0.2939,
"step": 5479
},
{
"epoch": 2.7089358546533187,
"grad_norm": 0.11271481304383014,
"learning_rate": 4.728280080140466e-06,
"loss": 0.2901,
"step": 5480
},
{
"epoch": 2.7094302311209986,
"grad_norm": 0.1205405420544215,
"learning_rate": 4.724971611021107e-06,
"loss": 0.3154,
"step": 5481
},
{
"epoch": 2.709924607588679,
"grad_norm": 0.12053697089434831,
"learning_rate": 4.721663941722675e-06,
"loss": 0.2853,
"step": 5482
},
{
"epoch": 2.710418984056359,
"grad_norm": 0.12491815671460557,
"learning_rate": 4.7183570727466855e-06,
"loss": 0.2862,
"step": 5483
},
{
"epoch": 2.710913360524039,
"grad_norm": 0.11738897405612905,
"learning_rate": 4.715051004594543e-06,
"loss": 0.286,
"step": 5484
},
{
"epoch": 2.711407736991719,
"grad_norm": 0.1197135234971391,
"learning_rate": 4.7117457377675325e-06,
"loss": 0.2988,
"step": 5485
},
{
"epoch": 2.7119021134593995,
"grad_norm": 0.11649970778576438,
"learning_rate": 4.708441272766803e-06,
"loss": 0.3005,
"step": 5486
},
{
"epoch": 2.7123964899270794,
"grad_norm": 0.12075930551615234,
"learning_rate": 4.705137610093398e-06,
"loss": 0.2921,
"step": 5487
},
{
"epoch": 2.7128908663947597,
"grad_norm": 0.11911328984908721,
"learning_rate": 4.701834750248229e-06,
"loss": 0.2779,
"step": 5488
},
{
"epoch": 2.7133852428624397,
"grad_norm": 0.11820641897548595,
"learning_rate": 4.698532693732081e-06,
"loss": 0.2824,
"step": 5489
},
{
"epoch": 2.71387961933012,
"grad_norm": 0.11955486839788493,
"learning_rate": 4.69523144104564e-06,
"loss": 0.2901,
"step": 5490
},
{
"epoch": 2.7143739957978,
"grad_norm": 0.11526791505189006,
"learning_rate": 4.691930992689449e-06,
"loss": 0.2921,
"step": 5491
},
{
"epoch": 2.7148683722654803,
"grad_norm": 0.1177271210019719,
"learning_rate": 4.6886313491639276e-06,
"loss": 0.2969,
"step": 5492
},
{
"epoch": 2.71536274873316,
"grad_norm": 0.12073526221419056,
"learning_rate": 4.685332510969394e-06,
"loss": 0.2909,
"step": 5493
},
{
"epoch": 2.7158571252008405,
"grad_norm": 0.12177393534250863,
"learning_rate": 4.682034478606019e-06,
"loss": 0.2854,
"step": 5494
},
{
"epoch": 2.7163515016685205,
"grad_norm": 0.11579966863534,
"learning_rate": 4.67873725257387e-06,
"loss": 0.3083,
"step": 5495
},
{
"epoch": 2.716845878136201,
"grad_norm": 0.12021230135245277,
"learning_rate": 4.675440833372887e-06,
"loss": 0.2956,
"step": 5496
},
{
"epoch": 2.7173402546038807,
"grad_norm": 0.1175648742616284,
"learning_rate": 4.672145221502882e-06,
"loss": 0.2854,
"step": 5497
},
{
"epoch": 2.717834631071561,
"grad_norm": 0.12090419468355088,
"learning_rate": 4.668850417463553e-06,
"loss": 0.2884,
"step": 5498
},
{
"epoch": 2.718329007539241,
"grad_norm": 0.11807785726696417,
"learning_rate": 4.66555642175447e-06,
"loss": 0.2884,
"step": 5499
},
{
"epoch": 2.7188233840069214,
"grad_norm": 0.1168556948145553,
"learning_rate": 4.662263234875077e-06,
"loss": 0.2935,
"step": 5500
},
{
"epoch": 2.7193177604746013,
"grad_norm": 0.11747968433335873,
"learning_rate": 4.658970857324705e-06,
"loss": 0.2993,
"step": 5501
},
{
"epoch": 2.7198121369422816,
"grad_norm": 0.11422621473355463,
"learning_rate": 4.65567928960256e-06,
"loss": 0.3058,
"step": 5502
},
{
"epoch": 2.7203065134099615,
"grad_norm": 0.12049884678598423,
"learning_rate": 4.6523885322077145e-06,
"loss": 0.3035,
"step": 5503
},
{
"epoch": 2.720800889877642,
"grad_norm": 0.11719122161498179,
"learning_rate": 4.649098585639136e-06,
"loss": 0.2745,
"step": 5504
},
{
"epoch": 2.721295266345322,
"grad_norm": 0.12482890051562145,
"learning_rate": 4.645809450395654e-06,
"loss": 0.3064,
"step": 5505
},
{
"epoch": 2.721789642813002,
"grad_norm": 0.1160486745594322,
"learning_rate": 4.642521126975974e-06,
"loss": 0.29,
"step": 5506
},
{
"epoch": 2.722284019280682,
"grad_norm": 0.11817945308198755,
"learning_rate": 4.6392336158786985e-06,
"loss": 0.3076,
"step": 5507
},
{
"epoch": 2.7227783957483624,
"grad_norm": 0.12238649502683503,
"learning_rate": 4.635946917602287e-06,
"loss": 0.2738,
"step": 5508
},
{
"epoch": 2.7232727722160424,
"grad_norm": 0.11449449415789102,
"learning_rate": 4.632661032645076e-06,
"loss": 0.3057,
"step": 5509
},
{
"epoch": 2.7237671486837227,
"grad_norm": 0.12230860368185631,
"learning_rate": 4.6293759615052946e-06,
"loss": 0.2835,
"step": 5510
},
{
"epoch": 2.7242615251514026,
"grad_norm": 0.12423535812452062,
"learning_rate": 4.626091704681028e-06,
"loss": 0.2778,
"step": 5511
},
{
"epoch": 2.724755901619083,
"grad_norm": 0.12024206018591103,
"learning_rate": 4.622808262670256e-06,
"loss": 0.2902,
"step": 5512
},
{
"epoch": 2.7252502780867633,
"grad_norm": 0.12000333407163971,
"learning_rate": 4.619525635970827e-06,
"loss": 0.2975,
"step": 5513
},
{
"epoch": 2.7257446545544433,
"grad_norm": 0.11821788200133053,
"learning_rate": 4.616243825080466e-06,
"loss": 0.2879,
"step": 5514
},
{
"epoch": 2.726239031022123,
"grad_norm": 0.11881533054055476,
"learning_rate": 4.612962830496767e-06,
"loss": 0.3033,
"step": 5515
},
{
"epoch": 2.7267334074898035,
"grad_norm": 0.11699132585950947,
"learning_rate": 4.609682652717218e-06,
"loss": 0.3172,
"step": 5516
},
{
"epoch": 2.727227783957484,
"grad_norm": 0.12821480563210633,
"learning_rate": 4.6064032922391624e-06,
"loss": 0.2968,
"step": 5517
},
{
"epoch": 2.727722160425164,
"grad_norm": 0.12697897664759228,
"learning_rate": 4.603124749559835e-06,
"loss": 0.2792,
"step": 5518
},
{
"epoch": 2.7282165368928437,
"grad_norm": 0.11972739043991681,
"learning_rate": 4.599847025176347e-06,
"loss": 0.3107,
"step": 5519
},
{
"epoch": 2.728710913360524,
"grad_norm": 0.11984436338251742,
"learning_rate": 4.596570119585671e-06,
"loss": 0.3079,
"step": 5520
},
{
"epoch": 2.7292052898282044,
"grad_norm": 0.11814768283334048,
"learning_rate": 4.593294033284671e-06,
"loss": 0.2877,
"step": 5521
},
{
"epoch": 2.7296996662958843,
"grad_norm": 0.1210179972723323,
"learning_rate": 4.590018766770074e-06,
"loss": 0.2983,
"step": 5522
},
{
"epoch": 2.7301940427635643,
"grad_norm": 0.12595927217764186,
"learning_rate": 4.5867443205384964e-06,
"loss": 0.2978,
"step": 5523
},
{
"epoch": 2.7306884192312446,
"grad_norm": 0.11788932237491996,
"learning_rate": 4.583470695086416e-06,
"loss": 0.2813,
"step": 5524
},
{
"epoch": 2.731182795698925,
"grad_norm": 0.12724156299441802,
"learning_rate": 4.5801978909102e-06,
"loss": 0.2931,
"step": 5525
},
{
"epoch": 2.731677172166605,
"grad_norm": 0.12334472821728301,
"learning_rate": 4.576925908506076e-06,
"loss": 0.2985,
"step": 5526
},
{
"epoch": 2.732171548634285,
"grad_norm": 0.11805548362707108,
"learning_rate": 4.573654748370163e-06,
"loss": 0.3003,
"step": 5527
},
{
"epoch": 2.732665925101965,
"grad_norm": 0.23630100784374738,
"learning_rate": 4.5703844109984395e-06,
"loss": 0.2906,
"step": 5528
},
{
"epoch": 2.7331603015696455,
"grad_norm": 0.11133783282058166,
"learning_rate": 4.567114896886773e-06,
"loss": 0.2756,
"step": 5529
},
{
"epoch": 2.7336546780373254,
"grad_norm": 0.12061368513821877,
"learning_rate": 4.563846206530901e-06,
"loss": 0.3004,
"step": 5530
},
{
"epoch": 2.7341490545050053,
"grad_norm": 0.11963102962435192,
"learning_rate": 4.5605783404264334e-06,
"loss": 0.2855,
"step": 5531
},
{
"epoch": 2.7346434309726857,
"grad_norm": 0.11974099831918385,
"learning_rate": 4.557311299068853e-06,
"loss": 0.3068,
"step": 5532
},
{
"epoch": 2.735137807440366,
"grad_norm": 0.1242473383519101,
"learning_rate": 4.554045082953525e-06,
"loss": 0.2769,
"step": 5533
},
{
"epoch": 2.735632183908046,
"grad_norm": 0.1179564462971841,
"learning_rate": 4.550779692575692e-06,
"loss": 0.3069,
"step": 5534
},
{
"epoch": 2.736126560375726,
"grad_norm": 0.1147150784525113,
"learning_rate": 4.547515128430455e-06,
"loss": 0.2784,
"step": 5535
},
{
"epoch": 2.7366209368434062,
"grad_norm": 0.11732858484357539,
"learning_rate": 4.544251391012809e-06,
"loss": 0.2694,
"step": 5536
},
{
"epoch": 2.7371153133110866,
"grad_norm": 0.11713851027200908,
"learning_rate": 4.540988480817613e-06,
"loss": 0.2904,
"step": 5537
},
{
"epoch": 2.7376096897787665,
"grad_norm": 0.11840506449392665,
"learning_rate": 4.537726398339597e-06,
"loss": 0.282,
"step": 5538
},
{
"epoch": 2.7381040662464464,
"grad_norm": 0.12338902385268707,
"learning_rate": 4.534465144073374e-06,
"loss": 0.3111,
"step": 5539
},
{
"epoch": 2.738598442714127,
"grad_norm": 0.11908664276993083,
"learning_rate": 4.5312047185134336e-06,
"loss": 0.296,
"step": 5540
},
{
"epoch": 2.739092819181807,
"grad_norm": 0.1200744031606582,
"learning_rate": 4.527945122154127e-06,
"loss": 0.2806,
"step": 5541
},
{
"epoch": 2.739587195649487,
"grad_norm": 0.12404751470220299,
"learning_rate": 4.524686355489693e-06,
"loss": 0.2989,
"step": 5542
},
{
"epoch": 2.740081572117167,
"grad_norm": 0.12071603944126488,
"learning_rate": 4.521428419014235e-06,
"loss": 0.3017,
"step": 5543
},
{
"epoch": 2.7405759485848473,
"grad_norm": 0.12293412620658836,
"learning_rate": 4.518171313221734e-06,
"loss": 0.2848,
"step": 5544
},
{
"epoch": 2.7410703250525277,
"grad_norm": 0.11813343513418634,
"learning_rate": 4.514915038606052e-06,
"loss": 0.2977,
"step": 5545
},
{
"epoch": 2.7415647015202076,
"grad_norm": 0.11484702834581202,
"learning_rate": 4.51165959566091e-06,
"loss": 0.2917,
"step": 5546
},
{
"epoch": 2.7420590779878875,
"grad_norm": 0.12106399506390317,
"learning_rate": 4.508404984879918e-06,
"loss": 0.3077,
"step": 5547
},
{
"epoch": 2.742553454455568,
"grad_norm": 0.13453831283731862,
"learning_rate": 4.50515120675655e-06,
"loss": 0.3125,
"step": 5548
},
{
"epoch": 2.743047830923248,
"grad_norm": 0.13086972671246036,
"learning_rate": 4.501898261784155e-06,
"loss": 0.3164,
"step": 5549
},
{
"epoch": 2.743542207390928,
"grad_norm": 0.1188312687307384,
"learning_rate": 4.498646150455957e-06,
"loss": 0.2955,
"step": 5550
},
{
"epoch": 2.7440365838586085,
"grad_norm": 0.12479611084848692,
"learning_rate": 4.495394873265061e-06,
"loss": 0.2889,
"step": 5551
},
{
"epoch": 2.7445309603262884,
"grad_norm": 0.12011295106955462,
"learning_rate": 4.492144430704432e-06,
"loss": 0.2958,
"step": 5552
},
{
"epoch": 2.7450253367939688,
"grad_norm": 0.1134124046481066,
"learning_rate": 4.4888948232669194e-06,
"loss": 0.2724,
"step": 5553
},
{
"epoch": 2.7455197132616487,
"grad_norm": 0.1118714857876748,
"learning_rate": 4.4856460514452405e-06,
"loss": 0.2989,
"step": 5554
},
{
"epoch": 2.746014089729329,
"grad_norm": 0.11631211581359886,
"learning_rate": 4.482398115731979e-06,
"loss": 0.2932,
"step": 5555
},
{
"epoch": 2.746508466197009,
"grad_norm": 0.11922095701259927,
"learning_rate": 4.479151016619615e-06,
"loss": 0.2812,
"step": 5556
},
{
"epoch": 2.7470028426646893,
"grad_norm": 0.1224020116714705,
"learning_rate": 4.4759047546004785e-06,
"loss": 0.2902,
"step": 5557
},
{
"epoch": 2.747497219132369,
"grad_norm": 0.12156576407126779,
"learning_rate": 4.472659330166777e-06,
"loss": 0.2941,
"step": 5558
},
{
"epoch": 2.7479915956000496,
"grad_norm": 0.11643638416943973,
"learning_rate": 4.469414743810603e-06,
"loss": 0.2928,
"step": 5559
},
{
"epoch": 2.7484859720677295,
"grad_norm": 0.12496679728300047,
"learning_rate": 4.466170996023905e-06,
"loss": 0.2994,
"step": 5560
},
{
"epoch": 2.74898034853541,
"grad_norm": 0.11873851705803685,
"learning_rate": 4.462928087298519e-06,
"loss": 0.2904,
"step": 5561
},
{
"epoch": 2.7494747250030898,
"grad_norm": 0.11199486465739161,
"learning_rate": 4.459686018126149e-06,
"loss": 0.2779,
"step": 5562
},
{
"epoch": 2.74996910147077,
"grad_norm": 0.1148318964818842,
"learning_rate": 4.456444788998369e-06,
"loss": 0.2845,
"step": 5563
},
{
"epoch": 2.75046347793845,
"grad_norm": 0.12208319283828141,
"learning_rate": 4.453204400406621e-06,
"loss": 0.2748,
"step": 5564
},
{
"epoch": 2.7509578544061304,
"grad_norm": 0.12188153302046052,
"learning_rate": 4.449964852842236e-06,
"loss": 0.2861,
"step": 5565
},
{
"epoch": 2.7514522308738103,
"grad_norm": 0.12182798777820712,
"learning_rate": 4.446726146796396e-06,
"loss": 0.3017,
"step": 5566
},
{
"epoch": 2.7514522308738103,
"eval_loss": 0.5019189119338989,
"eval_runtime": 100.9553,
"eval_samples_per_second": 300.668,
"eval_steps_per_second": 37.591,
"step": 5566
},
{
"epoch": 2.7519466073414907,
"grad_norm": 0.12059628567321018,
"learning_rate": 4.443488282760174e-06,
"loss": 0.3092,
"step": 5567
},
{
"epoch": 2.7524409838091706,
"grad_norm": 0.11810741638447714,
"learning_rate": 4.440251261224509e-06,
"loss": 0.2866,
"step": 5568
},
{
"epoch": 2.752935360276851,
"grad_norm": 0.11702030116014142,
"learning_rate": 4.437015082680208e-06,
"loss": 0.2987,
"step": 5569
},
{
"epoch": 2.753429736744531,
"grad_norm": 0.1226734773161813,
"learning_rate": 4.433779747617953e-06,
"loss": 0.3041,
"step": 5570
},
{
"epoch": 2.753924113212211,
"grad_norm": 0.12025485026520062,
"learning_rate": 4.4305452565282996e-06,
"loss": 0.2935,
"step": 5571
},
{
"epoch": 2.754418489679891,
"grad_norm": 0.11605539056392683,
"learning_rate": 4.427311609901671e-06,
"loss": 0.2911,
"step": 5572
},
{
"epoch": 2.7549128661475715,
"grad_norm": 0.11900905670395062,
"learning_rate": 4.424078808228374e-06,
"loss": 0.2837,
"step": 5573
},
{
"epoch": 2.7554072426152514,
"grad_norm": 0.12154418582783903,
"learning_rate": 4.420846851998574e-06,
"loss": 0.283,
"step": 5574
},
{
"epoch": 2.7559016190829317,
"grad_norm": 0.11580762119447079,
"learning_rate": 4.417615741702308e-06,
"loss": 0.283,
"step": 5575
},
{
"epoch": 2.7563959955506117,
"grad_norm": 0.11519764337818957,
"learning_rate": 4.4143854778294996e-06,
"loss": 0.3037,
"step": 5576
},
{
"epoch": 2.756890372018292,
"grad_norm": 0.26308531570057575,
"learning_rate": 4.4111560608699245e-06,
"loss": 0.3112,
"step": 5577
},
{
"epoch": 2.757384748485972,
"grad_norm": 0.12146088699301011,
"learning_rate": 4.407927491313245e-06,
"loss": 0.2654,
"step": 5578
},
{
"epoch": 2.7578791249536523,
"grad_norm": 0.11829968427954725,
"learning_rate": 4.404699769648993e-06,
"loss": 0.2918,
"step": 5579
},
{
"epoch": 2.758373501421332,
"grad_norm": 0.11785124519179183,
"learning_rate": 4.4014728963665654e-06,
"loss": 0.2956,
"step": 5580
},
{
"epoch": 2.7588678778890126,
"grad_norm": 0.11781448889586675,
"learning_rate": 4.3982468719552295e-06,
"loss": 0.3039,
"step": 5581
},
{
"epoch": 2.7593622543566925,
"grad_norm": 0.12015453653527707,
"learning_rate": 4.395021696904132e-06,
"loss": 0.2857,
"step": 5582
},
{
"epoch": 2.759856630824373,
"grad_norm": 0.1229508736798809,
"learning_rate": 4.39179737170229e-06,
"loss": 0.2922,
"step": 5583
},
{
"epoch": 2.7603510072920527,
"grad_norm": 0.12061714916944309,
"learning_rate": 4.388573896838581e-06,
"loss": 0.3106,
"step": 5584
},
{
"epoch": 2.760845383759733,
"grad_norm": 0.9071145271832266,
"learning_rate": 4.385351272801771e-06,
"loss": 0.3226,
"step": 5585
},
{
"epoch": 2.761339760227413,
"grad_norm": 0.11607711103650786,
"learning_rate": 4.38212950008048e-06,
"loss": 0.3,
"step": 5586
},
{
"epoch": 2.7618341366950934,
"grad_norm": 0.12374529405286179,
"learning_rate": 4.378908579163205e-06,
"loss": 0.2963,
"step": 5587
},
{
"epoch": 2.7623285131627737,
"grad_norm": 0.12602212668885215,
"learning_rate": 4.375688510538318e-06,
"loss": 0.2934,
"step": 5588
},
{
"epoch": 2.7628228896304536,
"grad_norm": 0.1323133039625717,
"learning_rate": 4.37246929469406e-06,
"loss": 0.2914,
"step": 5589
},
{
"epoch": 2.7633172660981336,
"grad_norm": 0.13685209461230247,
"learning_rate": 4.369250932118537e-06,
"loss": 0.2745,
"step": 5590
},
{
"epoch": 2.763811642565814,
"grad_norm": 0.12688275552883388,
"learning_rate": 4.366033423299737e-06,
"loss": 0.2913,
"step": 5591
},
{
"epoch": 2.7643060190334943,
"grad_norm": 0.13193481643346444,
"learning_rate": 4.362816768725503e-06,
"loss": 0.2804,
"step": 5592
},
{
"epoch": 2.764800395501174,
"grad_norm": 0.1206529305251512,
"learning_rate": 4.359600968883562e-06,
"loss": 0.2864,
"step": 5593
},
{
"epoch": 2.765294771968854,
"grad_norm": 0.13554916828096636,
"learning_rate": 4.356386024261508e-06,
"loss": 0.2934,
"step": 5594
},
{
"epoch": 2.7657891484365345,
"grad_norm": 0.1253559826876969,
"learning_rate": 4.3531719353467995e-06,
"loss": 0.2832,
"step": 5595
},
{
"epoch": 2.766283524904215,
"grad_norm": 0.11741577380704032,
"learning_rate": 4.349958702626775e-06,
"loss": 0.2726,
"step": 5596
},
{
"epoch": 2.7667779013718947,
"grad_norm": 0.1236450159086945,
"learning_rate": 4.346746326588634e-06,
"loss": 0.3053,
"step": 5597
},
{
"epoch": 2.7672722778395746,
"grad_norm": 0.12340524057514092,
"learning_rate": 4.343534807719446e-06,
"loss": 0.274,
"step": 5598
},
{
"epoch": 2.767766654307255,
"grad_norm": 0.1233177124157095,
"learning_rate": 4.34032414650616e-06,
"loss": 0.31,
"step": 5599
},
{
"epoch": 2.7682610307749353,
"grad_norm": 0.1241619517192944,
"learning_rate": 4.33711434343559e-06,
"loss": 0.3288,
"step": 5600
},
{
"epoch": 2.7687554072426153,
"grad_norm": 0.15463838397492172,
"learning_rate": 4.333905398994414e-06,
"loss": 0.2811,
"step": 5601
},
{
"epoch": 2.769249783710295,
"grad_norm": 0.11759232367101621,
"learning_rate": 4.330697313669191e-06,
"loss": 0.3041,
"step": 5602
},
{
"epoch": 2.7697441601779755,
"grad_norm": 0.11946605621248073,
"learning_rate": 4.3274900879463414e-06,
"loss": 0.2856,
"step": 5603
},
{
"epoch": 2.770238536645656,
"grad_norm": 0.12267119010462493,
"learning_rate": 4.324283722312148e-06,
"loss": 0.2824,
"step": 5604
},
{
"epoch": 2.770732913113336,
"grad_norm": 0.1204948516524664,
"learning_rate": 4.321078217252791e-06,
"loss": 0.2911,
"step": 5605
},
{
"epoch": 2.7712272895810157,
"grad_norm": 0.1279152039203681,
"learning_rate": 4.317873573254292e-06,
"loss": 0.3257,
"step": 5606
},
{
"epoch": 2.771721666048696,
"grad_norm": 0.12242663421258146,
"learning_rate": 4.31466979080255e-06,
"loss": 0.2743,
"step": 5607
},
{
"epoch": 2.7722160425163764,
"grad_norm": 0.12479495047588751,
"learning_rate": 4.31146687038334e-06,
"loss": 0.3236,
"step": 5608
},
{
"epoch": 2.7727104189840563,
"grad_norm": 0.1206941971099183,
"learning_rate": 4.308264812482296e-06,
"loss": 0.3099,
"step": 5609
},
{
"epoch": 2.7732047954517363,
"grad_norm": 0.12586932055519431,
"learning_rate": 4.305063617584931e-06,
"loss": 0.293,
"step": 5610
},
{
"epoch": 2.7736991719194166,
"grad_norm": 0.12669374326351965,
"learning_rate": 4.301863286176625e-06,
"loss": 0.2965,
"step": 5611
},
{
"epoch": 2.774193548387097,
"grad_norm": 0.12211949371183799,
"learning_rate": 4.298663818742623e-06,
"loss": 0.2756,
"step": 5612
},
{
"epoch": 2.774687924854777,
"grad_norm": 0.12539552284339595,
"learning_rate": 4.2954652157680365e-06,
"loss": 0.3147,
"step": 5613
},
{
"epoch": 2.775182301322457,
"grad_norm": 0.12030108704908721,
"learning_rate": 4.292267477737859e-06,
"loss": 0.2995,
"step": 5614
},
{
"epoch": 2.775676677790137,
"grad_norm": 0.12086644763957607,
"learning_rate": 4.289070605136936e-06,
"loss": 0.2882,
"step": 5615
},
{
"epoch": 2.7761710542578175,
"grad_norm": 0.1170829156440895,
"learning_rate": 4.285874598449994e-06,
"loss": 0.2868,
"step": 5616
},
{
"epoch": 2.7766654307254974,
"grad_norm": 0.12507030772166447,
"learning_rate": 4.282679458161627e-06,
"loss": 0.3081,
"step": 5617
},
{
"epoch": 2.7771598071931773,
"grad_norm": 0.12431273558062081,
"learning_rate": 4.279485184756289e-06,
"loss": 0.3072,
"step": 5618
},
{
"epoch": 2.7776541836608577,
"grad_norm": 0.12048830940281066,
"learning_rate": 4.276291778718316e-06,
"loss": 0.3013,
"step": 5619
},
{
"epoch": 2.778148560128538,
"grad_norm": 0.1179580649479625,
"learning_rate": 4.273099240531901e-06,
"loss": 0.2856,
"step": 5620
},
{
"epoch": 2.778642936596218,
"grad_norm": 0.12113751355347059,
"learning_rate": 4.2699075706811e-06,
"loss": 0.2673,
"step": 5621
},
{
"epoch": 2.779137313063898,
"grad_norm": 0.12113330648389001,
"learning_rate": 4.266716769649864e-06,
"loss": 0.3047,
"step": 5622
},
{
"epoch": 2.7796316895315782,
"grad_norm": 0.12041418556352665,
"learning_rate": 4.263526837921988e-06,
"loss": 0.2908,
"step": 5623
},
{
"epoch": 2.7801260659992586,
"grad_norm": 0.12391323884500022,
"learning_rate": 4.260337775981137e-06,
"loss": 0.275,
"step": 5624
},
{
"epoch": 2.7806204424669385,
"grad_norm": 0.12190234734643336,
"learning_rate": 4.257149584310858e-06,
"loss": 0.2936,
"step": 5625
},
{
"epoch": 2.781114818934619,
"grad_norm": 0.1375037709353238,
"learning_rate": 4.253962263394547e-06,
"loss": 0.3044,
"step": 5626
},
{
"epoch": 2.781609195402299,
"grad_norm": 0.1303433151512609,
"learning_rate": 4.2507758137154865e-06,
"loss": 0.28,
"step": 5627
},
{
"epoch": 2.782103571869979,
"grad_norm": 0.12126875508403713,
"learning_rate": 4.24759023575682e-06,
"loss": 0.3108,
"step": 5628
},
{
"epoch": 2.782597948337659,
"grad_norm": 0.12525047703986247,
"learning_rate": 4.244405530001553e-06,
"loss": 0.3054,
"step": 5629
},
{
"epoch": 2.7830923248053394,
"grad_norm": 0.11492967621016933,
"learning_rate": 4.241221696932561e-06,
"loss": 0.2866,
"step": 5630
},
{
"epoch": 2.7835867012730193,
"grad_norm": 0.1276553767868089,
"learning_rate": 4.238038737032594e-06,
"loss": 0.2879,
"step": 5631
},
{
"epoch": 2.7840810777406997,
"grad_norm": 0.11590355351026611,
"learning_rate": 4.234856650784267e-06,
"loss": 0.2877,
"step": 5632
},
{
"epoch": 2.7845754542083796,
"grad_norm": 0.12825499282064143,
"learning_rate": 4.2316754386700544e-06,
"loss": 0.3061,
"step": 5633
},
{
"epoch": 2.78506983067606,
"grad_norm": 0.13382799364913858,
"learning_rate": 4.228495101172312e-06,
"loss": 0.3341,
"step": 5634
},
{
"epoch": 2.78556420714374,
"grad_norm": 0.12115695175317569,
"learning_rate": 4.225315638773246e-06,
"loss": 0.285,
"step": 5635
},
{
"epoch": 2.7860585836114202,
"grad_norm": 0.12339267307111892,
"learning_rate": 4.222137051954949e-06,
"loss": 0.2973,
"step": 5636
},
{
"epoch": 2.7865529600791,
"grad_norm": 0.11724091038211656,
"learning_rate": 4.2189593411993615e-06,
"loss": 0.3003,
"step": 5637
},
{
"epoch": 2.7870473365467805,
"grad_norm": 0.12900732697557737,
"learning_rate": 4.21578250698831e-06,
"loss": 0.2951,
"step": 5638
},
{
"epoch": 2.7875417130144604,
"grad_norm": 0.11681903192418892,
"learning_rate": 4.212606549803469e-06,
"loss": 0.2958,
"step": 5639
},
{
"epoch": 2.7880360894821408,
"grad_norm": 0.12386765407540784,
"learning_rate": 4.209431470126402e-06,
"loss": 0.2856,
"step": 5640
},
{
"epoch": 2.7885304659498207,
"grad_norm": 0.12311369089010762,
"learning_rate": 4.206257268438514e-06,
"loss": 0.3008,
"step": 5641
},
{
"epoch": 2.789024842417501,
"grad_norm": 0.11948886508789698,
"learning_rate": 4.203083945221098e-06,
"loss": 0.2807,
"step": 5642
},
{
"epoch": 2.789519218885181,
"grad_norm": 0.11509782535194495,
"learning_rate": 4.1999115009553075e-06,
"loss": 0.2775,
"step": 5643
},
{
"epoch": 2.7900135953528613,
"grad_norm": 0.12103771767675962,
"learning_rate": 4.196739936122155e-06,
"loss": 0.2896,
"step": 5644
},
{
"epoch": 2.790507971820541,
"grad_norm": 0.12011154537359503,
"learning_rate": 4.193569251202533e-06,
"loss": 0.2831,
"step": 5645
},
{
"epoch": 2.7910023482882216,
"grad_norm": 0.11907388728905409,
"learning_rate": 4.190399446677189e-06,
"loss": 0.2967,
"step": 5646
},
{
"epoch": 2.7914967247559015,
"grad_norm": 0.11467793643876774,
"learning_rate": 4.187230523026739e-06,
"loss": 0.2777,
"step": 5647
},
{
"epoch": 2.791991101223582,
"grad_norm": 0.1185109108930619,
"learning_rate": 4.184062480731671e-06,
"loss": 0.3002,
"step": 5648
},
{
"epoch": 2.7924854776912618,
"grad_norm": 0.17983299172447295,
"learning_rate": 4.180895320272339e-06,
"loss": 0.3082,
"step": 5649
},
{
"epoch": 2.792979854158942,
"grad_norm": 0.11788433951422321,
"learning_rate": 4.177729042128955e-06,
"loss": 0.2847,
"step": 5650
},
{
"epoch": 2.793474230626622,
"grad_norm": 0.1166153490264969,
"learning_rate": 4.174563646781608e-06,
"loss": 0.2901,
"step": 5651
},
{
"epoch": 2.7939686070943024,
"grad_norm": 0.11720498414579177,
"learning_rate": 4.171399134710248e-06,
"loss": 0.2941,
"step": 5652
},
{
"epoch": 2.7944629835619823,
"grad_norm": 0.12199999411769366,
"learning_rate": 4.168235506394679e-06,
"loss": 0.3096,
"step": 5653
},
{
"epoch": 2.7949573600296627,
"grad_norm": 0.12242616162355957,
"learning_rate": 4.1650727623146e-06,
"loss": 0.2966,
"step": 5654
},
{
"epoch": 2.7954517364973426,
"grad_norm": 0.12133434911368969,
"learning_rate": 4.161910902949552e-06,
"loss": 0.3211,
"step": 5655
},
{
"epoch": 2.795946112965023,
"grad_norm": 0.12155157806058217,
"learning_rate": 4.158749928778944e-06,
"loss": 0.2936,
"step": 5656
},
{
"epoch": 2.796440489432703,
"grad_norm": 0.12178565454036633,
"learning_rate": 4.155589840282063e-06,
"loss": 0.2965,
"step": 5657
},
{
"epoch": 2.796934865900383,
"grad_norm": 0.12201321070476835,
"learning_rate": 4.152430637938048e-06,
"loss": 0.3012,
"step": 5658
},
{
"epoch": 2.797429242368063,
"grad_norm": 0.11711041426759143,
"learning_rate": 4.149272322225913e-06,
"loss": 0.2922,
"step": 5659
},
{
"epoch": 2.7979236188357435,
"grad_norm": 0.11952864398092898,
"learning_rate": 4.146114893624537e-06,
"loss": 0.2937,
"step": 5660
},
{
"epoch": 2.7984179953034234,
"grad_norm": 0.11535176384422428,
"learning_rate": 4.142958352612656e-06,
"loss": 0.2983,
"step": 5661
},
{
"epoch": 2.7989123717711037,
"grad_norm": 0.11420295686936001,
"learning_rate": 4.1398026996688844e-06,
"loss": 0.2947,
"step": 5662
},
{
"epoch": 2.799406748238784,
"grad_norm": 0.11698930564271282,
"learning_rate": 4.136647935271691e-06,
"loss": 0.2898,
"step": 5663
},
{
"epoch": 2.799901124706464,
"grad_norm": 0.11019635550261851,
"learning_rate": 4.133494059899411e-06,
"loss": 0.2769,
"step": 5664
},
{
"epoch": 2.800395501174144,
"grad_norm": 0.38287500603910746,
"learning_rate": 4.130341074030251e-06,
"loss": 0.2937,
"step": 5665
},
{
"epoch": 2.8008898776418243,
"grad_norm": 0.11404964415252475,
"learning_rate": 4.127188978142282e-06,
"loss": 0.2686,
"step": 5666
},
{
"epoch": 2.8013842541095046,
"grad_norm": 0.11742019167747905,
"learning_rate": 4.1240377727134305e-06,
"loss": 0.2768,
"step": 5667
},
{
"epoch": 2.8018786305771846,
"grad_norm": 0.11760572912368528,
"learning_rate": 4.120887458221502e-06,
"loss": 0.2894,
"step": 5668
},
{
"epoch": 2.8023730070448645,
"grad_norm": 0.11707222721522323,
"learning_rate": 4.117738035144158e-06,
"loss": 0.2863,
"step": 5669
},
{
"epoch": 2.802867383512545,
"grad_norm": 0.11663735447060573,
"learning_rate": 4.114589503958917e-06,
"loss": 0.309,
"step": 5670
},
{
"epoch": 2.803361759980225,
"grad_norm": 0.11461200327325209,
"learning_rate": 4.111441865143187e-06,
"loss": 0.268,
"step": 5671
},
{
"epoch": 2.803856136447905,
"grad_norm": 0.12187578240120689,
"learning_rate": 4.108295119174219e-06,
"loss": 0.2963,
"step": 5672
},
{
"epoch": 2.804350512915585,
"grad_norm": 0.12246351159345434,
"learning_rate": 4.105149266529133e-06,
"loss": 0.3214,
"step": 5673
},
{
"epoch": 2.8048448893832654,
"grad_norm": 0.14496455892659063,
"learning_rate": 4.102004307684919e-06,
"loss": 0.2866,
"step": 5674
},
{
"epoch": 2.8053392658509457,
"grad_norm": 0.12189988348953124,
"learning_rate": 4.098860243118424e-06,
"loss": 0.2939,
"step": 5675
},
{
"epoch": 2.8058336423186256,
"grad_norm": 0.11648018409391658,
"learning_rate": 4.095717073306367e-06,
"loss": 0.279,
"step": 5676
},
{
"epoch": 2.8063280187863056,
"grad_norm": 0.12537417878955118,
"learning_rate": 4.09257479872533e-06,
"loss": 0.2864,
"step": 5677
},
{
"epoch": 2.806822395253986,
"grad_norm": 0.11838521255023049,
"learning_rate": 4.089433419851757e-06,
"loss": 0.2978,
"step": 5678
},
{
"epoch": 2.8073167717216663,
"grad_norm": 0.11538412981061844,
"learning_rate": 4.08629293716195e-06,
"loss": 0.2985,
"step": 5679
},
{
"epoch": 2.807811148189346,
"grad_norm": 0.11948183417636185,
"learning_rate": 4.083153351132089e-06,
"loss": 0.2937,
"step": 5680
},
{
"epoch": 2.808305524657026,
"grad_norm": 0.12059249723846127,
"learning_rate": 4.080014662238203e-06,
"loss": 0.2933,
"step": 5681
},
{
"epoch": 2.8087999011247065,
"grad_norm": 0.11572539993248067,
"learning_rate": 4.076876870956198e-06,
"loss": 0.2902,
"step": 5682
},
{
"epoch": 2.809294277592387,
"grad_norm": 0.11978440006642727,
"learning_rate": 4.073739977761841e-06,
"loss": 0.3087,
"step": 5683
},
{
"epoch": 2.8097886540600667,
"grad_norm": 0.12825645662586557,
"learning_rate": 4.070603983130754e-06,
"loss": 0.3038,
"step": 5684
},
{
"epoch": 2.8102830305277466,
"grad_norm": 0.1164055080595423,
"learning_rate": 4.067468887538435e-06,
"loss": 0.2867,
"step": 5685
},
{
"epoch": 2.810777406995427,
"grad_norm": 0.11682084360638789,
"learning_rate": 4.064334691460232e-06,
"loss": 0.2796,
"step": 5686
},
{
"epoch": 2.8112717834631074,
"grad_norm": 0.11566359776445388,
"learning_rate": 4.061201395371373e-06,
"loss": 0.3021,
"step": 5687
},
{
"epoch": 2.8117661599307873,
"grad_norm": 0.12192639738857647,
"learning_rate": 4.058068999746935e-06,
"loss": 0.2893,
"step": 5688
},
{
"epoch": 2.812260536398467,
"grad_norm": 0.11950073797456329,
"learning_rate": 4.054937505061868e-06,
"loss": 0.304,
"step": 5689
},
{
"epoch": 2.8127549128661475,
"grad_norm": 0.12044590822409042,
"learning_rate": 4.051806911790977e-06,
"loss": 0.2917,
"step": 5690
},
{
"epoch": 2.813249289333828,
"grad_norm": 0.12496559619196397,
"learning_rate": 4.048677220408942e-06,
"loss": 0.2722,
"step": 5691
},
{
"epoch": 2.813743665801508,
"grad_norm": 0.11443871711039265,
"learning_rate": 4.045548431390291e-06,
"loss": 0.2979,
"step": 5692
},
{
"epoch": 2.8142380422691877,
"grad_norm": 0.119444280092338,
"learning_rate": 4.042420545209429e-06,
"loss": 0.3046,
"step": 5693
},
{
"epoch": 2.814732418736868,
"grad_norm": 0.11962358633520749,
"learning_rate": 4.0392935623406205e-06,
"loss": 0.2856,
"step": 5694
},
{
"epoch": 2.8152267952045484,
"grad_norm": 0.11814420192337839,
"learning_rate": 4.036167483257989e-06,
"loss": 0.3119,
"step": 5695
},
{
"epoch": 2.8157211716722284,
"grad_norm": 0.11749106040589119,
"learning_rate": 4.033042308435519e-06,
"loss": 0.2826,
"step": 5696
},
{
"epoch": 2.8162155481399083,
"grad_norm": 0.12001777766540733,
"learning_rate": 4.029918038347064e-06,
"loss": 0.2925,
"step": 5697
},
{
"epoch": 2.8167099246075886,
"grad_norm": 0.1184026468650932,
"learning_rate": 4.026794673466344e-06,
"loss": 0.2966,
"step": 5698
},
{
"epoch": 2.817204301075269,
"grad_norm": 0.1156309957612324,
"learning_rate": 4.023672214266928e-06,
"loss": 0.2749,
"step": 5699
},
{
"epoch": 2.817698677542949,
"grad_norm": 0.11966974483604555,
"learning_rate": 4.020550661222264e-06,
"loss": 0.2842,
"step": 5700
},
{
"epoch": 2.8181930540106293,
"grad_norm": 0.12353459228122561,
"learning_rate": 4.017430014805649e-06,
"loss": 0.3148,
"step": 5701
},
{
"epoch": 2.818687430478309,
"grad_norm": 0.11842045970998637,
"learning_rate": 4.014310275490245e-06,
"loss": 0.2908,
"step": 5702
},
{
"epoch": 2.8191818069459895,
"grad_norm": 0.11706737961476252,
"learning_rate": 4.011191443749085e-06,
"loss": 0.2883,
"step": 5703
},
{
"epoch": 2.8196761834136694,
"grad_norm": 0.11497515498108739,
"learning_rate": 4.008073520055059e-06,
"loss": 0.265,
"step": 5704
},
{
"epoch": 2.82017055988135,
"grad_norm": 0.11595232833855851,
"learning_rate": 4.004956504880914e-06,
"loss": 0.3037,
"step": 5705
},
{
"epoch": 2.8206649363490297,
"grad_norm": 0.12229152124720524,
"learning_rate": 4.001840398699271e-06,
"loss": 0.2908,
"step": 5706
},
{
"epoch": 2.82115931281671,
"grad_norm": 0.11962322668252941,
"learning_rate": 3.9987252019825995e-06,
"loss": 0.2845,
"step": 5707
},
{
"epoch": 2.82165368928439,
"grad_norm": 0.1198732456593222,
"learning_rate": 3.995610915203241e-06,
"loss": 0.2867,
"step": 5708
},
{
"epoch": 2.8221480657520703,
"grad_norm": 0.12572846990252093,
"learning_rate": 3.9924975388334004e-06,
"loss": 0.2713,
"step": 5709
},
{
"epoch": 2.8226424422197502,
"grad_norm": 0.11723328790261682,
"learning_rate": 3.9893850733451336e-06,
"loss": 0.2736,
"step": 5710
},
{
"epoch": 2.8231368186874306,
"grad_norm": 0.1178764921516918,
"learning_rate": 3.98627351921037e-06,
"loss": 0.3118,
"step": 5711
},
{
"epoch": 2.8236311951551105,
"grad_norm": 0.12002331418060162,
"learning_rate": 3.983162876900896e-06,
"loss": 0.2884,
"step": 5712
},
{
"epoch": 2.824125571622791,
"grad_norm": 0.11902508178813838,
"learning_rate": 3.9800531468883515e-06,
"loss": 0.296,
"step": 5713
},
{
"epoch": 2.824619948090471,
"grad_norm": 0.12068891797829502,
"learning_rate": 3.976944329644254e-06,
"loss": 0.2735,
"step": 5714
},
{
"epoch": 2.825114324558151,
"grad_norm": 0.11763704814510212,
"learning_rate": 3.973836425639976e-06,
"loss": 0.2917,
"step": 5715
},
{
"epoch": 2.825608701025831,
"grad_norm": 0.11646830227590263,
"learning_rate": 3.970729435346744e-06,
"loss": 0.2719,
"step": 5716
},
{
"epoch": 2.8261030774935114,
"grad_norm": 0.11541637347837479,
"learning_rate": 3.9676233592356595e-06,
"loss": 0.2824,
"step": 5717
},
{
"epoch": 2.8265974539611913,
"grad_norm": 0.12182381438962502,
"learning_rate": 3.964518197777673e-06,
"loss": 0.3069,
"step": 5718
},
{
"epoch": 2.8270918304288717,
"grad_norm": 0.11844936662305328,
"learning_rate": 3.961413951443598e-06,
"loss": 0.2918,
"step": 5719
},
{
"epoch": 2.8275862068965516,
"grad_norm": 0.12413040409372006,
"learning_rate": 3.958310620704125e-06,
"loss": 0.2975,
"step": 5720
},
{
"epoch": 2.828080583364232,
"grad_norm": 0.11740862394281805,
"learning_rate": 3.9552082060297835e-06,
"loss": 0.2994,
"step": 5721
},
{
"epoch": 2.828574959831912,
"grad_norm": 0.12279114492495077,
"learning_rate": 3.952106707890975e-06,
"loss": 0.2863,
"step": 5722
},
{
"epoch": 2.8290693362995922,
"grad_norm": 0.1326591313274138,
"learning_rate": 3.949006126757966e-06,
"loss": 0.2765,
"step": 5723
},
{
"epoch": 2.829563712767272,
"grad_norm": 0.11546040077153799,
"learning_rate": 3.9459064631008715e-06,
"loss": 0.2782,
"step": 5724
},
{
"epoch": 2.8300580892349525,
"grad_norm": 0.11359806134850427,
"learning_rate": 3.94280771738968e-06,
"loss": 0.2805,
"step": 5725
},
{
"epoch": 2.8305524657026324,
"grad_norm": 0.12403092226250882,
"learning_rate": 3.939709890094237e-06,
"loss": 0.2995,
"step": 5726
},
{
"epoch": 2.8310468421703128,
"grad_norm": 0.11983492476513233,
"learning_rate": 3.936612981684247e-06,
"loss": 0.2816,
"step": 5727
},
{
"epoch": 2.8315412186379927,
"grad_norm": 0.11812545679696738,
"learning_rate": 3.9335169926292704e-06,
"loss": 0.2702,
"step": 5728
},
{
"epoch": 2.832035595105673,
"grad_norm": 0.11812484088385806,
"learning_rate": 3.93042192339874e-06,
"loss": 0.2924,
"step": 5729
},
{
"epoch": 2.832529971573353,
"grad_norm": 0.11721411269292178,
"learning_rate": 3.927327774461937e-06,
"loss": 0.2817,
"step": 5730
},
{
"epoch": 2.8330243480410333,
"grad_norm": 0.11903210754060345,
"learning_rate": 3.924234546288009e-06,
"loss": 0.3028,
"step": 5731
},
{
"epoch": 2.8335187245087132,
"grad_norm": 0.11927717463208358,
"learning_rate": 3.921142239345972e-06,
"loss": 0.2636,
"step": 5732
},
{
"epoch": 2.8340131009763936,
"grad_norm": 0.11653632568809963,
"learning_rate": 3.918050854104683e-06,
"loss": 0.2787,
"step": 5733
},
{
"epoch": 2.8345074774440735,
"grad_norm": 0.11141985667731802,
"learning_rate": 3.914960391032879e-06,
"loss": 0.2755,
"step": 5734
},
{
"epoch": 2.835001853911754,
"grad_norm": 0.11863829023898069,
"learning_rate": 3.911870850599141e-06,
"loss": 0.2974,
"step": 5735
},
{
"epoch": 2.8354962303794338,
"grad_norm": 0.11885133275527827,
"learning_rate": 3.908782233271921e-06,
"loss": 0.2841,
"step": 5736
},
{
"epoch": 2.835990606847114,
"grad_norm": 0.12329550864501415,
"learning_rate": 3.905694539519531e-06,
"loss": 0.2959,
"step": 5737
},
{
"epoch": 2.8364849833147945,
"grad_norm": 0.12259815221401005,
"learning_rate": 3.9026077698101364e-06,
"loss": 0.2977,
"step": 5738
},
{
"epoch": 2.8369793597824744,
"grad_norm": 0.11979794689686123,
"learning_rate": 3.899521924611761e-06,
"loss": 0.2961,
"step": 5739
},
{
"epoch": 2.8374737362501543,
"grad_norm": 0.11988382138310195,
"learning_rate": 3.896437004392301e-06,
"loss": 0.2868,
"step": 5740
},
{
"epoch": 2.8379681127178347,
"grad_norm": 0.11401148567733657,
"learning_rate": 3.893353009619497e-06,
"loss": 0.2801,
"step": 5741
},
{
"epoch": 2.838462489185515,
"grad_norm": 0.12063500397070746,
"learning_rate": 3.890269940760961e-06,
"loss": 0.2936,
"step": 5742
},
{
"epoch": 2.838956865653195,
"grad_norm": 0.11551029444666677,
"learning_rate": 3.887187798284162e-06,
"loss": 0.2839,
"step": 5743
},
{
"epoch": 2.839451242120875,
"grad_norm": 0.11399930190875657,
"learning_rate": 3.884106582656425e-06,
"loss": 0.2914,
"step": 5744
},
{
"epoch": 2.839945618588555,
"grad_norm": 0.11707877178749798,
"learning_rate": 3.881026294344932e-06,
"loss": 0.2722,
"step": 5745
},
{
"epoch": 2.8404399950562356,
"grad_norm": 0.12114416071855627,
"learning_rate": 3.877946933816731e-06,
"loss": 0.2973,
"step": 5746
},
{
"epoch": 2.8409343715239155,
"grad_norm": 0.1199224386301225,
"learning_rate": 3.874868501538732e-06,
"loss": 0.273,
"step": 5747
},
{
"epoch": 2.8414287479915954,
"grad_norm": 0.12635165804163823,
"learning_rate": 3.871790997977692e-06,
"loss": 0.2921,
"step": 5748
},
{
"epoch": 2.8419231244592758,
"grad_norm": 0.11901448064057289,
"learning_rate": 3.868714423600242e-06,
"loss": 0.3048,
"step": 5749
},
{
"epoch": 2.842417500926956,
"grad_norm": 0.11821190415345369,
"learning_rate": 3.865638778872859e-06,
"loss": 0.2919,
"step": 5750
},
{
"epoch": 2.842911877394636,
"grad_norm": 0.11707728730413144,
"learning_rate": 3.8625640642618824e-06,
"loss": 0.2743,
"step": 5751
},
{
"epoch": 2.843406253862316,
"grad_norm": 0.11796975864250178,
"learning_rate": 3.859490280233516e-06,
"loss": 0.2884,
"step": 5752
},
{
"epoch": 2.8439006303299963,
"grad_norm": 0.11992335391971647,
"learning_rate": 3.856417427253824e-06,
"loss": 0.2856,
"step": 5753
},
{
"epoch": 2.8443950067976767,
"grad_norm": 0.12343241610014231,
"learning_rate": 3.853345505788716e-06,
"loss": 0.2963,
"step": 5754
},
{
"epoch": 2.8448893832653566,
"grad_norm": 0.12919588645863073,
"learning_rate": 3.850274516303977e-06,
"loss": 0.3006,
"step": 5755
},
{
"epoch": 2.8453837597330365,
"grad_norm": 0.11896008879597784,
"learning_rate": 3.847204459265234e-06,
"loss": 0.2807,
"step": 5756
},
{
"epoch": 2.845878136200717,
"grad_norm": 0.1252772495815424,
"learning_rate": 3.844135335137989e-06,
"loss": 0.3115,
"step": 5757
},
{
"epoch": 2.846372512668397,
"grad_norm": 0.12073514418470091,
"learning_rate": 3.841067144387594e-06,
"loss": 0.2779,
"step": 5758
},
{
"epoch": 2.846866889136077,
"grad_norm": 0.12844844469265806,
"learning_rate": 3.837999887479253e-06,
"loss": 0.2922,
"step": 5759
},
{
"epoch": 2.847361265603757,
"grad_norm": 0.12212987808581428,
"learning_rate": 3.834933564878048e-06,
"loss": 0.2981,
"step": 5760
},
{
"epoch": 2.8478556420714374,
"grad_norm": 0.11883474977832705,
"learning_rate": 3.831868177048897e-06,
"loss": 0.2954,
"step": 5761
},
{
"epoch": 2.8483500185391177,
"grad_norm": 0.12205794430582033,
"learning_rate": 3.828803724456589e-06,
"loss": 0.2834,
"step": 5762
},
{
"epoch": 2.8488443950067976,
"grad_norm": 0.11771545329115524,
"learning_rate": 3.8257402075657675e-06,
"loss": 0.2912,
"step": 5763
},
{
"epoch": 2.8493387714744776,
"grad_norm": 0.12121040566779186,
"learning_rate": 3.822677626840942e-06,
"loss": 0.2773,
"step": 5764
},
{
"epoch": 2.849833147942158,
"grad_norm": 0.11669807957850513,
"learning_rate": 3.819615982746463e-06,
"loss": 0.2837,
"step": 5765
},
{
"epoch": 2.8503275244098383,
"grad_norm": 0.12627833148380438,
"learning_rate": 3.816555275746558e-06,
"loss": 0.2952,
"step": 5766
},
{
"epoch": 2.850821900877518,
"grad_norm": 0.11660133907682999,
"learning_rate": 3.8134955063053016e-06,
"loss": 0.276,
"step": 5767
},
{
"epoch": 2.851316277345198,
"grad_norm": 0.11982712276863017,
"learning_rate": 3.8104366748866197e-06,
"loss": 0.3092,
"step": 5768
},
{
"epoch": 2.8518106538128785,
"grad_norm": 0.11717524833239831,
"learning_rate": 3.8073787819543175e-06,
"loss": 0.2873,
"step": 5769
},
{
"epoch": 2.852305030280559,
"grad_norm": 0.12022823411900199,
"learning_rate": 3.8043218279720396e-06,
"loss": 0.2912,
"step": 5770
},
{
"epoch": 2.8527994067482387,
"grad_norm": 0.11298315487282405,
"learning_rate": 3.8012658134032896e-06,
"loss": 0.3009,
"step": 5771
},
{
"epoch": 2.853293783215919,
"grad_norm": 0.11843429445728988,
"learning_rate": 3.7982107387114396e-06,
"loss": 0.2884,
"step": 5772
},
{
"epoch": 2.853788159683599,
"grad_norm": 0.11798694157635432,
"learning_rate": 3.7951566043597055e-06,
"loss": 0.2664,
"step": 5773
},
{
"epoch": 2.8542825361512794,
"grad_norm": 0.12136153437064161,
"learning_rate": 3.792103410811171e-06,
"loss": 0.3014,
"step": 5774
},
{
"epoch": 2.8547769126189593,
"grad_norm": 0.12161414869291778,
"learning_rate": 3.789051158528776e-06,
"loss": 0.2884,
"step": 5775
},
{
"epoch": 2.8552712890866396,
"grad_norm": 0.11808095018116899,
"learning_rate": 3.7859998479753134e-06,
"loss": 0.2756,
"step": 5776
},
{
"epoch": 2.8557656655543195,
"grad_norm": 0.11704551802081782,
"learning_rate": 3.7829494796134304e-06,
"loss": 0.2972,
"step": 5777
},
{
"epoch": 2.856260042022,
"grad_norm": 0.119544910802182,
"learning_rate": 3.779900053905643e-06,
"loss": 0.2955,
"step": 5778
},
{
"epoch": 2.85675441848968,
"grad_norm": 0.12286864727658509,
"learning_rate": 3.7768515713143106e-06,
"loss": 0.2805,
"step": 5779
},
{
"epoch": 2.85724879495736,
"grad_norm": 0.1451072524396735,
"learning_rate": 3.77380403230166e-06,
"loss": 0.3416,
"step": 5780
},
{
"epoch": 2.85774317142504,
"grad_norm": 0.11457118201260277,
"learning_rate": 3.770757437329775e-06,
"loss": 0.284,
"step": 5781
},
{
"epoch": 2.8582375478927204,
"grad_norm": 0.12416531585106642,
"learning_rate": 3.767711786860585e-06,
"loss": 0.2771,
"step": 5782
},
{
"epoch": 2.8587319243604004,
"grad_norm": 0.11300488181840922,
"learning_rate": 3.7646670813558915e-06,
"loss": 0.3056,
"step": 5783
},
{
"epoch": 2.8592263008280807,
"grad_norm": 0.11995025414778653,
"learning_rate": 3.76162332127734e-06,
"loss": 0.2867,
"step": 5784
},
{
"epoch": 2.8597206772957606,
"grad_norm": 0.11196998650910675,
"learning_rate": 3.758580507086432e-06,
"loss": 0.2879,
"step": 5785
},
{
"epoch": 2.860215053763441,
"grad_norm": 0.1209170365614516,
"learning_rate": 3.7555386392445447e-06,
"loss": 0.29,
"step": 5786
},
{
"epoch": 2.860709430231121,
"grad_norm": 0.11402121421820917,
"learning_rate": 3.752497718212892e-06,
"loss": 0.3036,
"step": 5787
},
{
"epoch": 2.8612038066988013,
"grad_norm": 0.1216929464646724,
"learning_rate": 3.749457744452545e-06,
"loss": 0.2845,
"step": 5788
},
{
"epoch": 2.861698183166481,
"grad_norm": 0.12829736689083823,
"learning_rate": 3.746418718424445e-06,
"loss": 0.3277,
"step": 5789
},
{
"epoch": 2.8621925596341615,
"grad_norm": 0.12236123098851662,
"learning_rate": 3.7433806405893745e-06,
"loss": 0.3083,
"step": 5790
},
{
"epoch": 2.8626869361018414,
"grad_norm": 0.11775348763460189,
"learning_rate": 3.7403435114079823e-06,
"loss": 0.3011,
"step": 5791
},
{
"epoch": 2.863181312569522,
"grad_norm": 0.12197966696839598,
"learning_rate": 3.737307331340774e-06,
"loss": 0.2938,
"step": 5792
},
{
"epoch": 2.8636756890372017,
"grad_norm": 0.12069175729871809,
"learning_rate": 3.734272100848103e-06,
"loss": 0.3162,
"step": 5793
},
{
"epoch": 2.864170065504882,
"grad_norm": 0.12351785482770584,
"learning_rate": 3.73123782039018e-06,
"loss": 0.3039,
"step": 5794
},
{
"epoch": 2.864664441972562,
"grad_norm": 0.12093339553532188,
"learning_rate": 3.728204490427079e-06,
"loss": 0.3131,
"step": 5795
},
{
"epoch": 2.8651588184402423,
"grad_norm": 0.11521334592450144,
"learning_rate": 3.7251721114187266e-06,
"loss": 0.2811,
"step": 5796
},
{
"epoch": 2.8656531949079223,
"grad_norm": 0.12107767889306013,
"learning_rate": 3.7221406838249006e-06,
"loss": 0.2938,
"step": 5797
},
{
"epoch": 2.8661475713756026,
"grad_norm": 0.11509469327392967,
"learning_rate": 3.7191102081052433e-06,
"loss": 0.2898,
"step": 5798
},
{
"epoch": 2.8666419478432825,
"grad_norm": 0.1197179659003399,
"learning_rate": 3.716080684719241e-06,
"loss": 0.3112,
"step": 5799
},
{
"epoch": 2.867136324310963,
"grad_norm": 0.12816777703831547,
"learning_rate": 3.713052114126249e-06,
"loss": 0.3081,
"step": 5800
},
{
"epoch": 2.867630700778643,
"grad_norm": 0.12056664129635677,
"learning_rate": 3.710024496785464e-06,
"loss": 0.2968,
"step": 5801
},
{
"epoch": 2.868125077246323,
"grad_norm": 0.11524848852989282,
"learning_rate": 3.706997833155953e-06,
"loss": 0.2902,
"step": 5802
},
{
"epoch": 2.868619453714003,
"grad_norm": 0.11213687684696778,
"learning_rate": 3.7039721236966243e-06,
"loss": 0.2853,
"step": 5803
},
{
"epoch": 2.8691138301816834,
"grad_norm": 0.12037170045709149,
"learning_rate": 3.7009473688662533e-06,
"loss": 0.2983,
"step": 5804
},
{
"epoch": 2.8696082066493633,
"grad_norm": 0.12027672350008742,
"learning_rate": 3.6979235691234606e-06,
"loss": 0.2738,
"step": 5805
},
{
"epoch": 2.8701025831170437,
"grad_norm": 0.11782020434089555,
"learning_rate": 3.6949007249267286e-06,
"loss": 0.3023,
"step": 5806
},
{
"epoch": 2.8705969595847236,
"grad_norm": 0.12152587629388996,
"learning_rate": 3.6918788367343984e-06,
"loss": 0.2819,
"step": 5807
},
{
"epoch": 2.871091336052404,
"grad_norm": 0.11961598681584758,
"learning_rate": 3.6888579050046515e-06,
"loss": 0.3027,
"step": 5808
},
{
"epoch": 2.8715857125200843,
"grad_norm": 0.11457456681928772,
"learning_rate": 3.6858379301955427e-06,
"loss": 0.3071,
"step": 5809
},
{
"epoch": 2.8720800889877642,
"grad_norm": 0.11887498811725809,
"learning_rate": 3.6828189127649683e-06,
"loss": 0.2723,
"step": 5810
},
{
"epoch": 2.872574465455444,
"grad_norm": 0.12190561608573074,
"learning_rate": 3.6798008531706796e-06,
"loss": 0.2802,
"step": 5811
},
{
"epoch": 2.8730688419231245,
"grad_norm": 0.12108665209404271,
"learning_rate": 3.676783751870291e-06,
"loss": 0.2848,
"step": 5812
},
{
"epoch": 2.873563218390805,
"grad_norm": 0.11635774537666825,
"learning_rate": 3.6737676093212716e-06,
"loss": 0.2751,
"step": 5813
},
{
"epoch": 2.8740575948584848,
"grad_norm": 0.1172069954097249,
"learning_rate": 3.6707524259809334e-06,
"loss": 0.2685,
"step": 5814
},
{
"epoch": 2.8745519713261647,
"grad_norm": 0.11145494204278031,
"learning_rate": 3.6677382023064577e-06,
"loss": 0.2943,
"step": 5815
},
{
"epoch": 2.875046347793845,
"grad_norm": 0.11692508782650475,
"learning_rate": 3.66472493875487e-06,
"loss": 0.2984,
"step": 5816
},
{
"epoch": 2.8755407242615254,
"grad_norm": 0.5793500607175854,
"learning_rate": 3.6617126357830458e-06,
"loss": 0.3102,
"step": 5817
},
{
"epoch": 2.8760351007292053,
"grad_norm": 0.11939048017212324,
"learning_rate": 3.658701293847736e-06,
"loss": 0.2957,
"step": 5818
},
{
"epoch": 2.8765294771968852,
"grad_norm": 0.12067289949233338,
"learning_rate": 3.6556909134055276e-06,
"loss": 0.3009,
"step": 5819
},
{
"epoch": 2.8770238536645656,
"grad_norm": 0.11789124473899242,
"learning_rate": 3.65268149491286e-06,
"loss": 0.267,
"step": 5820
},
{
"epoch": 2.877518230132246,
"grad_norm": 0.12277304477784244,
"learning_rate": 3.649673038826043e-06,
"loss": 0.3007,
"step": 5821
},
{
"epoch": 2.878012606599926,
"grad_norm": 0.11233127176414369,
"learning_rate": 3.646665545601221e-06,
"loss": 0.2974,
"step": 5822
},
{
"epoch": 2.8785069830676058,
"grad_norm": 0.12416427260198759,
"learning_rate": 3.6436590156944087e-06,
"loss": 0.2863,
"step": 5823
},
{
"epoch": 2.879001359535286,
"grad_norm": 0.12441026330277948,
"learning_rate": 3.64065344956147e-06,
"loss": 0.2842,
"step": 5824
},
{
"epoch": 2.8794957360029665,
"grad_norm": 0.1255305072744874,
"learning_rate": 3.637648847658113e-06,
"loss": 0.2923,
"step": 5825
},
{
"epoch": 2.8799901124706464,
"grad_norm": 0.12063531576949357,
"learning_rate": 3.6346452104399165e-06,
"loss": 0.3028,
"step": 5826
},
{
"epoch": 2.8804844889383263,
"grad_norm": 0.11913321030160358,
"learning_rate": 3.631642538362299e-06,
"loss": 0.2766,
"step": 5827
},
{
"epoch": 2.8809788654060067,
"grad_norm": 0.3110773641065992,
"learning_rate": 3.6286408318805342e-06,
"loss": 0.2904,
"step": 5828
},
{
"epoch": 2.881473241873687,
"grad_norm": 0.1196492223237673,
"learning_rate": 3.625640091449758e-06,
"loss": 0.294,
"step": 5829
},
{
"epoch": 2.881967618341367,
"grad_norm": 0.14283836028388208,
"learning_rate": 3.622640317524957e-06,
"loss": 0.2733,
"step": 5830
},
{
"epoch": 2.882461994809047,
"grad_norm": 0.11968407706195008,
"learning_rate": 3.6196415105609616e-06,
"loss": 0.2757,
"step": 5831
},
{
"epoch": 2.882956371276727,
"grad_norm": 0.12048990189858204,
"learning_rate": 3.616643671012471e-06,
"loss": 0.2889,
"step": 5832
},
{
"epoch": 2.8834507477444076,
"grad_norm": 0.1210801355771658,
"learning_rate": 3.613646799334024e-06,
"loss": 0.3031,
"step": 5833
},
{
"epoch": 2.8839451242120875,
"grad_norm": 0.11930109807766733,
"learning_rate": 3.6106508959800136e-06,
"loss": 0.3153,
"step": 5834
},
{
"epoch": 2.8844395006797674,
"grad_norm": 0.11823126090243194,
"learning_rate": 3.6076559614047035e-06,
"loss": 0.2877,
"step": 5835
},
{
"epoch": 2.8849338771474478,
"grad_norm": 0.11992043322792441,
"learning_rate": 3.604661996062191e-06,
"loss": 0.2853,
"step": 5836
},
{
"epoch": 2.885428253615128,
"grad_norm": 0.11349002213165922,
"learning_rate": 3.6016690004064305e-06,
"loss": 0.2943,
"step": 5837
},
{
"epoch": 2.885922630082808,
"grad_norm": 0.11755967447378925,
"learning_rate": 3.5986769748912363e-06,
"loss": 0.3015,
"step": 5838
},
{
"epoch": 2.886417006550488,
"grad_norm": 0.11792663268835378,
"learning_rate": 3.5956859199702678e-06,
"loss": 0.2812,
"step": 5839
},
{
"epoch": 2.8869113830181683,
"grad_norm": 0.12939019748660072,
"learning_rate": 3.592695836097041e-06,
"loss": 0.2953,
"step": 5840
},
{
"epoch": 2.8874057594858487,
"grad_norm": 0.12951009160763904,
"learning_rate": 3.5897067237249307e-06,
"loss": 0.2718,
"step": 5841
},
{
"epoch": 2.8879001359535286,
"grad_norm": 0.12217329985297769,
"learning_rate": 3.586718583307153e-06,
"loss": 0.299,
"step": 5842
},
{
"epoch": 2.8883945124212085,
"grad_norm": 0.1131819814563794,
"learning_rate": 3.5837314152967773e-06,
"loss": 0.2743,
"step": 5843
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.11730974216171307,
"learning_rate": 3.5807452201467387e-06,
"loss": 0.2876,
"step": 5844
},
{
"epoch": 2.889383265356569,
"grad_norm": 0.11870125138559706,
"learning_rate": 3.577759998309809e-06,
"loss": 0.259,
"step": 5845
},
{
"epoch": 2.889877641824249,
"grad_norm": 0.11614513121609749,
"learning_rate": 3.5747757502386214e-06,
"loss": 0.3023,
"step": 5846
},
{
"epoch": 2.8903720182919295,
"grad_norm": 0.11818375835396729,
"learning_rate": 3.5717924763856648e-06,
"loss": 0.3017,
"step": 5847
},
{
"epoch": 2.8908663947596094,
"grad_norm": 0.11795541920479484,
"learning_rate": 3.568810177203268e-06,
"loss": 0.2985,
"step": 5848
},
{
"epoch": 2.8913607712272897,
"grad_norm": 0.12757285216646996,
"learning_rate": 3.5658288531436248e-06,
"loss": 0.2822,
"step": 5849
},
{
"epoch": 2.8918551476949697,
"grad_norm": 0.11643680557576133,
"learning_rate": 3.562848504658769e-06,
"loss": 0.2957,
"step": 5850
},
{
"epoch": 2.89234952416265,
"grad_norm": 0.11567326208594018,
"learning_rate": 3.5598691322006005e-06,
"loss": 0.2808,
"step": 5851
},
{
"epoch": 2.89284390063033,
"grad_norm": 0.11844787149447472,
"learning_rate": 3.556890736220857e-06,
"loss": 0.291,
"step": 5852
},
{
"epoch": 2.8933382770980103,
"grad_norm": 0.12212626953456064,
"learning_rate": 3.5539133171711416e-06,
"loss": 0.2896,
"step": 5853
},
{
"epoch": 2.89383265356569,
"grad_norm": 0.11873992627301888,
"learning_rate": 3.550936875502894e-06,
"loss": 0.3131,
"step": 5854
},
{
"epoch": 2.8943270300333706,
"grad_norm": 0.1213415048887557,
"learning_rate": 3.547961411667423e-06,
"loss": 0.2901,
"step": 5855
},
{
"epoch": 2.8948214065010505,
"grad_norm": 0.16948416684384487,
"learning_rate": 3.544986926115872e-06,
"loss": 0.2958,
"step": 5856
},
{
"epoch": 2.895315782968731,
"grad_norm": 0.12084176203728145,
"learning_rate": 3.5420134192992493e-06,
"loss": 0.2912,
"step": 5857
},
{
"epoch": 2.8958101594364107,
"grad_norm": 0.1180832916391241,
"learning_rate": 3.539040891668413e-06,
"loss": 0.3023,
"step": 5858
},
{
"epoch": 2.896304535904091,
"grad_norm": 0.11801719840091676,
"learning_rate": 3.5360693436740656e-06,
"loss": 0.2847,
"step": 5859
},
{
"epoch": 2.896798912371771,
"grad_norm": 0.11823648739987769,
"learning_rate": 3.5330987757667613e-06,
"loss": 0.2841,
"step": 5860
},
{
"epoch": 2.8972932888394514,
"grad_norm": 0.11590602148190059,
"learning_rate": 3.5301291883969136e-06,
"loss": 0.2873,
"step": 5861
},
{
"epoch": 2.8977876653071313,
"grad_norm": 0.1297793209450423,
"learning_rate": 3.527160582014787e-06,
"loss": 0.3226,
"step": 5862
},
{
"epoch": 2.8982820417748116,
"grad_norm": 0.12286694107032975,
"learning_rate": 3.524192957070487e-06,
"loss": 0.28,
"step": 5863
},
{
"epoch": 2.8987764182424915,
"grad_norm": 0.11586664344686289,
"learning_rate": 3.5212263140139813e-06,
"loss": 0.2798,
"step": 5864
},
{
"epoch": 2.899270794710172,
"grad_norm": 0.11757368203218109,
"learning_rate": 3.5182606532950836e-06,
"loss": 0.3052,
"step": 5865
},
{
"epoch": 2.899765171177852,
"grad_norm": 0.11641118259035929,
"learning_rate": 3.515295975363454e-06,
"loss": 0.2975,
"step": 5866
},
{
"epoch": 2.900259547645532,
"grad_norm": 0.11593441988360763,
"learning_rate": 3.5123322806686135e-06,
"loss": 0.279,
"step": 5867
},
{
"epoch": 2.900753924113212,
"grad_norm": 0.49099649928854155,
"learning_rate": 3.5093695696599304e-06,
"loss": 0.3244,
"step": 5868
},
{
"epoch": 2.9012483005808924,
"grad_norm": 0.1205271584482621,
"learning_rate": 3.506407842786619e-06,
"loss": 0.2988,
"step": 5869
},
{
"epoch": 2.9017426770485724,
"grad_norm": 0.12090574608041962,
"learning_rate": 3.5034471004977534e-06,
"loss": 0.2841,
"step": 5870
},
{
"epoch": 2.9022370535162527,
"grad_norm": 0.11902124662652194,
"learning_rate": 3.500487343242247e-06,
"loss": 0.3014,
"step": 5871
},
{
"epoch": 2.9027314299839326,
"grad_norm": 0.11836386543311757,
"learning_rate": 3.4975285714688734e-06,
"loss": 0.303,
"step": 5872
},
{
"epoch": 2.903225806451613,
"grad_norm": 0.12478074806020609,
"learning_rate": 3.4945707856262557e-06,
"loss": 0.2856,
"step": 5873
},
{
"epoch": 2.903720182919293,
"grad_norm": 0.12180637234921476,
"learning_rate": 3.4916139861628593e-06,
"loss": 0.2808,
"step": 5874
},
{
"epoch": 2.9042145593869733,
"grad_norm": 0.11715140191057435,
"learning_rate": 3.4886581735270133e-06,
"loss": 0.3052,
"step": 5875
},
{
"epoch": 2.904708935854653,
"grad_norm": 0.12430735881661427,
"learning_rate": 3.4857033481668856e-06,
"loss": 0.2552,
"step": 5876
},
{
"epoch": 2.9052033123223335,
"grad_norm": 0.11665802982656293,
"learning_rate": 3.4827495105304967e-06,
"loss": 0.2779,
"step": 5877
},
{
"epoch": 2.9056976887900134,
"grad_norm": 0.12975983114473194,
"learning_rate": 3.4797966610657198e-06,
"loss": 0.2904,
"step": 5878
},
{
"epoch": 2.906192065257694,
"grad_norm": 0.11986846705207371,
"learning_rate": 3.476844800220284e-06,
"loss": 0.2765,
"step": 5879
},
{
"epoch": 2.9066864417253737,
"grad_norm": 0.1143399484742762,
"learning_rate": 3.473893928441754e-06,
"loss": 0.2985,
"step": 5880
},
{
"epoch": 2.907180818193054,
"grad_norm": 0.12460007083806347,
"learning_rate": 3.47094404617756e-06,
"loss": 0.2783,
"step": 5881
},
{
"epoch": 2.907675194660734,
"grad_norm": 0.12291795609601669,
"learning_rate": 3.4679951538749712e-06,
"loss": 0.3023,
"step": 5882
},
{
"epoch": 2.9081695711284143,
"grad_norm": 0.11704924528772553,
"learning_rate": 3.465047251981104e-06,
"loss": 0.2859,
"step": 5883
},
{
"epoch": 2.9086639475960947,
"grad_norm": 0.11810619072959141,
"learning_rate": 3.4621003409429453e-06,
"loss": 0.2981,
"step": 5884
},
{
"epoch": 2.9091583240637746,
"grad_norm": 0.12220539981091813,
"learning_rate": 3.459154421207309e-06,
"loss": 0.3078,
"step": 5885
},
{
"epoch": 2.9096527005314545,
"grad_norm": 0.11784890847626628,
"learning_rate": 3.456209493220867e-06,
"loss": 0.3055,
"step": 5886
},
{
"epoch": 2.910147076999135,
"grad_norm": 0.11607378558146156,
"learning_rate": 3.4532655574301444e-06,
"loss": 0.2916,
"step": 5887
},
{
"epoch": 2.9106414534668152,
"grad_norm": 0.12178033998937812,
"learning_rate": 3.450322614281507e-06,
"loss": 0.307,
"step": 5888
},
{
"epoch": 2.911135829934495,
"grad_norm": 0.13160324715657595,
"learning_rate": 3.4473806642211793e-06,
"loss": 0.2685,
"step": 5889
},
{
"epoch": 2.911630206402175,
"grad_norm": 0.13011023294990082,
"learning_rate": 3.444439707695235e-06,
"loss": 0.3008,
"step": 5890
},
{
"epoch": 2.9121245828698554,
"grad_norm": 0.12087589536786227,
"learning_rate": 3.44149974514959e-06,
"loss": 0.2934,
"step": 5891
},
{
"epoch": 2.912618959337536,
"grad_norm": 0.11890414784003132,
"learning_rate": 3.43856077703001e-06,
"loss": 0.2861,
"step": 5892
},
{
"epoch": 2.9131133358052157,
"grad_norm": 0.11567086485769262,
"learning_rate": 3.4356228037821206e-06,
"loss": 0.2997,
"step": 5893
},
{
"epoch": 2.9136077122728956,
"grad_norm": 0.12053304513508062,
"learning_rate": 3.4326858258513807e-06,
"loss": 0.2805,
"step": 5894
},
{
"epoch": 2.914102088740576,
"grad_norm": 0.11640726803694539,
"learning_rate": 3.4297498436831113e-06,
"loss": 0.2915,
"step": 5895
},
{
"epoch": 2.9145964652082563,
"grad_norm": 0.12326954281663392,
"learning_rate": 3.42681485772248e-06,
"loss": 0.3067,
"step": 5896
},
{
"epoch": 2.9150908416759362,
"grad_norm": 0.12165931408714141,
"learning_rate": 3.4238808684144964e-06,
"loss": 0.2966,
"step": 5897
},
{
"epoch": 2.915585218143616,
"grad_norm": 0.11689517635835917,
"learning_rate": 3.4209478762040284e-06,
"loss": 0.3191,
"step": 5898
},
{
"epoch": 2.9160795946112965,
"grad_norm": 0.12184056363393565,
"learning_rate": 3.418015881535781e-06,
"loss": 0.271,
"step": 5899
},
{
"epoch": 2.916573971078977,
"grad_norm": 0.11854318595875324,
"learning_rate": 3.4150848848543208e-06,
"loss": 0.2916,
"step": 5900
},
{
"epoch": 2.917068347546657,
"grad_norm": 0.12080834984365128,
"learning_rate": 3.4121548866040587e-06,
"loss": 0.3022,
"step": 5901
},
{
"epoch": 2.9175627240143367,
"grad_norm": 0.11724195768578431,
"learning_rate": 3.4092258872292494e-06,
"loss": 0.2828,
"step": 5902
},
{
"epoch": 2.918057100482017,
"grad_norm": 0.11280173626189469,
"learning_rate": 3.406297887173997e-06,
"loss": 0.2973,
"step": 5903
},
{
"epoch": 2.9185514769496974,
"grad_norm": 0.11817488760923386,
"learning_rate": 3.4033708868822635e-06,
"loss": 0.2814,
"step": 5904
},
{
"epoch": 2.9190458534173773,
"grad_norm": 0.1223920277284147,
"learning_rate": 3.4004448867978445e-06,
"loss": 0.3031,
"step": 5905
},
{
"epoch": 2.9195402298850572,
"grad_norm": 0.11951019611506856,
"learning_rate": 3.3975198873643964e-06,
"loss": 0.2823,
"step": 5906
},
{
"epoch": 2.9200346063527376,
"grad_norm": 0.12051985577085801,
"learning_rate": 3.3945958890254215e-06,
"loss": 0.2993,
"step": 5907
},
{
"epoch": 2.920528982820418,
"grad_norm": 0.12085185037094018,
"learning_rate": 3.391672892224266e-06,
"loss": 0.2941,
"step": 5908
},
{
"epoch": 2.921023359288098,
"grad_norm": 0.11751293207204076,
"learning_rate": 3.3887508974041217e-06,
"loss": 0.3076,
"step": 5909
},
{
"epoch": 2.921517735755778,
"grad_norm": 0.17807313620776308,
"learning_rate": 3.3858299050080377e-06,
"loss": 0.3161,
"step": 5910
},
{
"epoch": 2.922012112223458,
"grad_norm": 0.11447347378695802,
"learning_rate": 3.382909915478909e-06,
"loss": 0.3149,
"step": 5911
},
{
"epoch": 2.9225064886911385,
"grad_norm": 0.12165545669655486,
"learning_rate": 3.37999092925947e-06,
"loss": 0.2862,
"step": 5912
},
{
"epoch": 2.9230008651588184,
"grad_norm": 0.12318300284691129,
"learning_rate": 3.3770729467923156e-06,
"loss": 0.2982,
"step": 5913
},
{
"epoch": 2.9234952416264983,
"grad_norm": 0.12244222418002663,
"learning_rate": 3.3741559685198798e-06,
"loss": 0.2997,
"step": 5914
},
{
"epoch": 2.9239896180941787,
"grad_norm": 0.11881922309468879,
"learning_rate": 3.371239994884441e-06,
"loss": 0.2831,
"step": 5915
},
{
"epoch": 2.924483994561859,
"grad_norm": 0.12614205935735806,
"learning_rate": 3.3683250263281354e-06,
"loss": 0.2809,
"step": 5916
},
{
"epoch": 2.924978371029539,
"grad_norm": 0.12178779927468336,
"learning_rate": 3.365411063292945e-06,
"loss": 0.2702,
"step": 5917
},
{
"epoch": 2.925472747497219,
"grad_norm": 0.11353999582199895,
"learning_rate": 3.3624981062206907e-06,
"loss": 0.2678,
"step": 5918
},
{
"epoch": 2.925967123964899,
"grad_norm": 0.12606873058785317,
"learning_rate": 3.359586155553053e-06,
"loss": 0.2975,
"step": 5919
},
{
"epoch": 2.9264615004325796,
"grad_norm": 0.11433410969448245,
"learning_rate": 3.356675211731546e-06,
"loss": 0.2909,
"step": 5920
},
{
"epoch": 2.9269558769002595,
"grad_norm": 0.11478789846384109,
"learning_rate": 3.3537652751975424e-06,
"loss": 0.2823,
"step": 5921
},
{
"epoch": 2.92745025336794,
"grad_norm": 0.11411558762875706,
"learning_rate": 3.350856346392263e-06,
"loss": 0.292,
"step": 5922
},
{
"epoch": 2.9279446298356198,
"grad_norm": 0.11996194169262851,
"learning_rate": 3.347948425756764e-06,
"loss": 0.301,
"step": 5923
},
{
"epoch": 2.9284390063033,
"grad_norm": 0.11663283420394839,
"learning_rate": 3.3450415137319613e-06,
"loss": 0.2909,
"step": 5924
},
{
"epoch": 2.92893338277098,
"grad_norm": 0.11984405368603056,
"learning_rate": 3.34213561075861e-06,
"loss": 0.3106,
"step": 5925
},
{
"epoch": 2.9294277592386604,
"grad_norm": 0.12376463094708748,
"learning_rate": 3.339230717277313e-06,
"loss": 0.2866,
"step": 5926
},
{
"epoch": 2.9299221357063403,
"grad_norm": 0.11118622831873622,
"learning_rate": 3.3363268337285224e-06,
"loss": 0.2999,
"step": 5927
},
{
"epoch": 2.9304165121740207,
"grad_norm": 0.11789723457196177,
"learning_rate": 3.333423960552542e-06,
"loss": 0.3018,
"step": 5928
},
{
"epoch": 2.9309108886417006,
"grad_norm": 0.11716646466078481,
"learning_rate": 3.3305220981895105e-06,
"loss": 0.3018,
"step": 5929
},
{
"epoch": 2.931405265109381,
"grad_norm": 0.118090921954278,
"learning_rate": 3.3276212470794244e-06,
"loss": 0.296,
"step": 5930
},
{
"epoch": 2.931899641577061,
"grad_norm": 0.11909701126176128,
"learning_rate": 3.3247214076621214e-06,
"loss": 0.2806,
"step": 5931
},
{
"epoch": 2.932394018044741,
"grad_norm": 0.11179014169246433,
"learning_rate": 3.3218225803772798e-06,
"loss": 0.2923,
"step": 5932
},
{
"epoch": 2.932888394512421,
"grad_norm": 0.11976013429489887,
"learning_rate": 3.318924765664443e-06,
"loss": 0.3422,
"step": 5933
},
{
"epoch": 2.9333827709801015,
"grad_norm": 0.11896166415184939,
"learning_rate": 3.3160279639629833e-06,
"loss": 0.2685,
"step": 5934
},
{
"epoch": 2.9338771474477814,
"grad_norm": 0.11601241133434015,
"learning_rate": 3.313132175712124e-06,
"loss": 0.2788,
"step": 5935
},
{
"epoch": 2.9343715239154617,
"grad_norm": 0.11875769775004415,
"learning_rate": 3.31023740135094e-06,
"loss": 0.3298,
"step": 5936
},
{
"epoch": 2.9348659003831417,
"grad_norm": 0.12087282804873056,
"learning_rate": 3.3073436413183437e-06,
"loss": 0.2893,
"step": 5937
},
{
"epoch": 2.935360276850822,
"grad_norm": 0.11701040188705171,
"learning_rate": 3.304450896053101e-06,
"loss": 0.3024,
"step": 5938
},
{
"epoch": 2.935854653318502,
"grad_norm": 0.1130234415708619,
"learning_rate": 3.301559165993825e-06,
"loss": 0.2723,
"step": 5939
},
{
"epoch": 2.9363490297861823,
"grad_norm": 0.12103810726859704,
"learning_rate": 3.298668451578969e-06,
"loss": 0.282,
"step": 5940
},
{
"epoch": 2.936843406253862,
"grad_norm": 0.11993819319036315,
"learning_rate": 3.29577875324683e-06,
"loss": 0.3097,
"step": 5941
},
{
"epoch": 2.9373377827215426,
"grad_norm": 0.11903925524813193,
"learning_rate": 3.292890071435563e-06,
"loss": 0.2975,
"step": 5942
},
{
"epoch": 2.9378321591892225,
"grad_norm": 0.12012498862799505,
"learning_rate": 3.290002406583155e-06,
"loss": 0.3022,
"step": 5943
},
{
"epoch": 2.938326535656903,
"grad_norm": 0.11811107084686527,
"learning_rate": 3.2871157591274483e-06,
"loss": 0.2804,
"step": 5944
},
{
"epoch": 2.9388209121245827,
"grad_norm": 0.11680154762264805,
"learning_rate": 3.2842301295061307e-06,
"loss": 0.3107,
"step": 5945
},
{
"epoch": 2.939315288592263,
"grad_norm": 0.11871172923570392,
"learning_rate": 3.2813455181567278e-06,
"loss": 0.2865,
"step": 5946
},
{
"epoch": 2.939809665059943,
"grad_norm": 0.12200841128395878,
"learning_rate": 3.278461925516622e-06,
"loss": 0.2938,
"step": 5947
},
{
"epoch": 2.9403040415276234,
"grad_norm": 0.1178262721679058,
"learning_rate": 3.2755793520230305e-06,
"loss": 0.2841,
"step": 5948
},
{
"epoch": 2.9407984179953033,
"grad_norm": 0.11812977675753994,
"learning_rate": 3.272697798113016e-06,
"loss": 0.2868,
"step": 5949
},
{
"epoch": 2.9412927944629836,
"grad_norm": 0.1209454198589376,
"learning_rate": 3.2698172642235027e-06,
"loss": 0.2821,
"step": 5950
},
{
"epoch": 2.9417871709306636,
"grad_norm": 0.11571495389127645,
"learning_rate": 3.2669377507912435e-06,
"loss": 0.2868,
"step": 5951
},
{
"epoch": 2.942281547398344,
"grad_norm": 0.11796680056178775,
"learning_rate": 3.2640592582528372e-06,
"loss": 0.2963,
"step": 5952
},
{
"epoch": 2.942775923866024,
"grad_norm": 0.12266056535917351,
"learning_rate": 3.2611817870447406e-06,
"loss": 0.2795,
"step": 5953
},
{
"epoch": 2.943270300333704,
"grad_norm": 0.12072409020788055,
"learning_rate": 3.258305337603239e-06,
"loss": 0.2841,
"step": 5954
},
{
"epoch": 2.943764676801384,
"grad_norm": 0.11894837735033736,
"learning_rate": 3.255429910364475e-06,
"loss": 0.2736,
"step": 5955
},
{
"epoch": 2.9442590532690645,
"grad_norm": 0.11492593052390632,
"learning_rate": 3.2525555057644365e-06,
"loss": 0.2853,
"step": 5956
},
{
"epoch": 2.9447534297367444,
"grad_norm": 0.1173290334799458,
"learning_rate": 3.2496821242389488e-06,
"loss": 0.2889,
"step": 5957
},
{
"epoch": 2.9452478062044247,
"grad_norm": 0.11976167945889474,
"learning_rate": 3.246809766223682e-06,
"loss": 0.2905,
"step": 5958
},
{
"epoch": 2.945742182672105,
"grad_norm": 0.11903664397709157,
"learning_rate": 3.2439384321541567e-06,
"loss": 0.2892,
"step": 5959
},
{
"epoch": 2.946236559139785,
"grad_norm": 0.12092718441218143,
"learning_rate": 3.2410681224657415e-06,
"loss": 0.3004,
"step": 5960
},
{
"epoch": 2.946730935607465,
"grad_norm": 0.11859722758945578,
"learning_rate": 3.238198837593636e-06,
"loss": 0.2903,
"step": 5961
},
{
"epoch": 2.9472253120751453,
"grad_norm": 0.1234406363658021,
"learning_rate": 3.2353305779728983e-06,
"loss": 0.2851,
"step": 5962
},
{
"epoch": 2.9477196885428256,
"grad_norm": 0.11671055758241607,
"learning_rate": 3.2324633440384222e-06,
"loss": 0.29,
"step": 5963
},
{
"epoch": 2.9482140650105055,
"grad_norm": 0.12194164682328971,
"learning_rate": 3.229597136224952e-06,
"loss": 0.2873,
"step": 5964
},
{
"epoch": 2.9487084414781854,
"grad_norm": 0.12568282292115207,
"learning_rate": 3.2267319549670707e-06,
"loss": 0.2884,
"step": 5965
},
{
"epoch": 2.949202817945866,
"grad_norm": 0.12367909065758019,
"learning_rate": 3.223867800699213e-06,
"loss": 0.3106,
"step": 5966
},
{
"epoch": 2.949697194413546,
"grad_norm": 0.12201599676058329,
"learning_rate": 3.2210046738556465e-06,
"loss": 0.279,
"step": 5967
},
{
"epoch": 2.950191570881226,
"grad_norm": 0.12172143207959245,
"learning_rate": 3.2181425748704977e-06,
"loss": 0.2918,
"step": 5968
},
{
"epoch": 2.950685947348906,
"grad_norm": 0.11877381955207082,
"learning_rate": 3.2152815041777217e-06,
"loss": 0.3026,
"step": 5969
},
{
"epoch": 2.9511803238165863,
"grad_norm": 0.11978853061297393,
"learning_rate": 3.2124214622111294e-06,
"loss": 0.2838,
"step": 5970
},
{
"epoch": 2.9516747002842667,
"grad_norm": 0.1253913330191872,
"learning_rate": 3.2095624494043763e-06,
"loss": 0.2947,
"step": 5971
},
{
"epoch": 2.9521690767519466,
"grad_norm": 0.11567078418672638,
"learning_rate": 3.2067044661909484e-06,
"loss": 0.2868,
"step": 5972
},
{
"epoch": 2.9526634532196265,
"grad_norm": 0.11837884191352539,
"learning_rate": 3.2038475130041937e-06,
"loss": 0.2919,
"step": 5973
},
{
"epoch": 2.953157829687307,
"grad_norm": 0.1127493762317042,
"learning_rate": 3.200991590277289e-06,
"loss": 0.2684,
"step": 5974
},
{
"epoch": 2.9536522061549872,
"grad_norm": 0.11640333281658702,
"learning_rate": 3.1981366984432594e-06,
"loss": 0.2906,
"step": 5975
},
{
"epoch": 2.954146582622667,
"grad_norm": 0.11549221039368626,
"learning_rate": 3.1952828379349774e-06,
"loss": 0.2881,
"step": 5976
},
{
"epoch": 2.954640959090347,
"grad_norm": 0.12179093946387856,
"learning_rate": 3.192430009185161e-06,
"loss": 0.2866,
"step": 5977
},
{
"epoch": 2.9551353355580274,
"grad_norm": 0.12754059161854395,
"learning_rate": 3.1895782126263598e-06,
"loss": 0.2905,
"step": 5978
},
{
"epoch": 2.955629712025708,
"grad_norm": 0.11871400442696367,
"learning_rate": 3.1867274486909828e-06,
"loss": 0.3155,
"step": 5979
},
{
"epoch": 2.9561240884933877,
"grad_norm": 0.1169454773474779,
"learning_rate": 3.183877717811268e-06,
"loss": 0.2784,
"step": 5980
},
{
"epoch": 2.9566184649610676,
"grad_norm": 0.1150488202252669,
"learning_rate": 3.1810290204192995e-06,
"loss": 0.2921,
"step": 5981
},
{
"epoch": 2.957112841428748,
"grad_norm": 0.11889553140010115,
"learning_rate": 3.178181356947019e-06,
"loss": 0.2798,
"step": 5982
},
{
"epoch": 2.9576072178964283,
"grad_norm": 0.11424278008470472,
"learning_rate": 3.1753347278261957e-06,
"loss": 0.28,
"step": 5983
},
{
"epoch": 2.9581015943641082,
"grad_norm": 0.11860430264405164,
"learning_rate": 3.1724891334884432e-06,
"loss": 0.2781,
"step": 5984
},
{
"epoch": 2.958595970831788,
"grad_norm": 0.12217919620042843,
"learning_rate": 3.169644574365228e-06,
"loss": 0.2865,
"step": 5985
},
{
"epoch": 2.9590903472994685,
"grad_norm": 0.11620659673408672,
"learning_rate": 3.166801050887849e-06,
"loss": 0.263,
"step": 5986
},
{
"epoch": 2.959584723767149,
"grad_norm": 0.11421251707663392,
"learning_rate": 3.1639585634874525e-06,
"loss": 0.2938,
"step": 5987
},
{
"epoch": 2.960079100234829,
"grad_norm": 0.11791180525381352,
"learning_rate": 3.1611171125950325e-06,
"loss": 0.2807,
"step": 5988
},
{
"epoch": 2.9605734767025087,
"grad_norm": 0.1129191992378997,
"learning_rate": 3.158276698641416e-06,
"loss": 0.2799,
"step": 5989
},
{
"epoch": 2.961067853170189,
"grad_norm": 0.11579228108563615,
"learning_rate": 3.155437322057283e-06,
"loss": 0.2894,
"step": 5990
},
{
"epoch": 2.9615622296378694,
"grad_norm": 0.11547677079381304,
"learning_rate": 3.1525989832731486e-06,
"loss": 0.2948,
"step": 5991
},
{
"epoch": 2.9620566061055493,
"grad_norm": 0.12399778229043547,
"learning_rate": 3.149761682719369e-06,
"loss": 0.3072,
"step": 5992
},
{
"epoch": 2.9625509825732292,
"grad_norm": 0.12324864928254022,
"learning_rate": 3.1469254208261512e-06,
"loss": 0.3001,
"step": 5993
},
{
"epoch": 2.9630453590409096,
"grad_norm": 0.14571136590477973,
"learning_rate": 3.144090198023544e-06,
"loss": 0.28,
"step": 5994
},
{
"epoch": 2.96353973550859,
"grad_norm": 0.12193101332000648,
"learning_rate": 3.141256014741427e-06,
"loss": 0.3017,
"step": 5995
},
{
"epoch": 2.96403411197627,
"grad_norm": 0.11937497249490713,
"learning_rate": 3.1384228714095387e-06,
"loss": 0.2685,
"step": 5996
},
{
"epoch": 2.9645284884439502,
"grad_norm": 0.1180927128772011,
"learning_rate": 3.1355907684574483e-06,
"loss": 0.296,
"step": 5997
},
{
"epoch": 2.96502286491163,
"grad_norm": 0.11511182191548776,
"learning_rate": 3.132759706314563e-06,
"loss": 0.2823,
"step": 5998
},
{
"epoch": 2.9655172413793105,
"grad_norm": 0.120656853204636,
"learning_rate": 3.1299296854101536e-06,
"loss": 0.3184,
"step": 5999
},
{
"epoch": 2.9660116178469904,
"grad_norm": 0.11193595085558933,
"learning_rate": 3.1271007061733126e-06,
"loss": 0.2926,
"step": 6000
},
{
"epoch": 2.9665059943146708,
"grad_norm": 0.12640623929958944,
"learning_rate": 3.1242727690329776e-06,
"loss": 0.3109,
"step": 6001
},
{
"epoch": 2.9670003707823507,
"grad_norm": 0.11850081431050172,
"learning_rate": 3.121445874417939e-06,
"loss": 0.2734,
"step": 6002
},
{
"epoch": 2.967494747250031,
"grad_norm": 0.11625606629352492,
"learning_rate": 3.1186200227568143e-06,
"loss": 0.3123,
"step": 6003
},
{
"epoch": 2.967989123717711,
"grad_norm": 0.1309927554540664,
"learning_rate": 3.1157952144780744e-06,
"loss": 0.2991,
"step": 6004
},
{
"epoch": 2.9684835001853913,
"grad_norm": 0.12448993435507688,
"learning_rate": 3.1129714500100306e-06,
"loss": 0.2804,
"step": 6005
},
{
"epoch": 2.9689778766530712,
"grad_norm": 0.11819768911229728,
"learning_rate": 3.1101487297808307e-06,
"loss": 0.2894,
"step": 6006
},
{
"epoch": 2.9694722531207516,
"grad_norm": 0.11448922601786332,
"learning_rate": 3.107327054218464e-06,
"loss": 0.2836,
"step": 6007
},
{
"epoch": 2.9699666295884315,
"grad_norm": 0.11671800209379865,
"learning_rate": 3.1045064237507704e-06,
"loss": 0.3079,
"step": 6008
},
{
"epoch": 2.970461006056112,
"grad_norm": 0.1142207970715991,
"learning_rate": 3.101686838805419e-06,
"loss": 0.2731,
"step": 6009
},
{
"epoch": 2.9709553825237918,
"grad_norm": 0.11829012230515469,
"learning_rate": 3.0988682998099282e-06,
"loss": 0.2868,
"step": 6010
},
{
"epoch": 2.971449758991472,
"grad_norm": 0.12048846301656518,
"learning_rate": 3.096050807191662e-06,
"loss": 0.2847,
"step": 6011
},
{
"epoch": 2.971944135459152,
"grad_norm": 0.12361407734991041,
"learning_rate": 3.0932343613778105e-06,
"loss": 0.3005,
"step": 6012
},
{
"epoch": 2.9724385119268324,
"grad_norm": 0.11910469215027499,
"learning_rate": 3.090418962795424e-06,
"loss": 0.2914,
"step": 6013
},
{
"epoch": 2.9729328883945123,
"grad_norm": 0.11940844136655983,
"learning_rate": 3.0876046118713756e-06,
"loss": 0.2794,
"step": 6014
},
{
"epoch": 2.9734272648621927,
"grad_norm": 0.11735714992137068,
"learning_rate": 3.0847913090323954e-06,
"loss": 0.2915,
"step": 6015
},
{
"epoch": 2.9739216413298726,
"grad_norm": 0.1147832914822521,
"learning_rate": 3.081979054705042e-06,
"loss": 0.2823,
"step": 6016
},
{
"epoch": 2.974416017797553,
"grad_norm": 0.11201879086051067,
"learning_rate": 3.079167849315727e-06,
"loss": 0.2674,
"step": 6017
},
{
"epoch": 2.974910394265233,
"grad_norm": 0.12481304922241009,
"learning_rate": 3.0763576932906903e-06,
"loss": 0.3091,
"step": 6018
},
{
"epoch": 2.975404770732913,
"grad_norm": 0.11750284354020887,
"learning_rate": 3.0735485870560245e-06,
"loss": 0.2986,
"step": 6019
},
{
"epoch": 2.975899147200593,
"grad_norm": 0.12525402024524362,
"learning_rate": 3.0707405310376513e-06,
"loss": 0.3149,
"step": 6020
},
{
"epoch": 2.9763935236682735,
"grad_norm": 0.11450553344800632,
"learning_rate": 3.067933525661343e-06,
"loss": 0.2889,
"step": 6021
},
{
"epoch": 2.9768879001359534,
"grad_norm": 0.11406559870788327,
"learning_rate": 3.065127571352713e-06,
"loss": 0.2741,
"step": 6022
},
{
"epoch": 2.9773822766036337,
"grad_norm": 0.11706890334579995,
"learning_rate": 3.0623226685372065e-06,
"loss": 0.279,
"step": 6023
},
{
"epoch": 2.9778766530713137,
"grad_norm": 0.12295102978019389,
"learning_rate": 3.0595188176401126e-06,
"loss": 0.2897,
"step": 6024
},
{
"epoch": 2.978371029538994,
"grad_norm": 0.11902956823811965,
"learning_rate": 3.0567160190865643e-06,
"loss": 0.2914,
"step": 6025
},
{
"epoch": 2.978865406006674,
"grad_norm": 0.1138544670622633,
"learning_rate": 3.0539142733015358e-06,
"loss": 0.2785,
"step": 6026
},
{
"epoch": 2.9793597824743543,
"grad_norm": 0.12105137844477692,
"learning_rate": 3.051113580709835e-06,
"loss": 0.2775,
"step": 6027
},
{
"epoch": 2.979854158942034,
"grad_norm": 0.1191541420956097,
"learning_rate": 3.0483139417361175e-06,
"loss": 0.2748,
"step": 6028
},
{
"epoch": 2.9803485354097146,
"grad_norm": 0.11997143202471107,
"learning_rate": 3.045515356804876e-06,
"loss": 0.2796,
"step": 6029
},
{
"epoch": 2.9808429118773945,
"grad_norm": 0.11616658459991556,
"learning_rate": 3.0427178263404367e-06,
"loss": 0.287,
"step": 6030
},
{
"epoch": 2.981337288345075,
"grad_norm": 0.11714110967532297,
"learning_rate": 3.0399213507669765e-06,
"loss": 0.2924,
"step": 6031
},
{
"epoch": 2.9818316648127547,
"grad_norm": 0.11286988464007089,
"learning_rate": 3.037125930508513e-06,
"loss": 0.282,
"step": 6032
},
{
"epoch": 2.982326041280435,
"grad_norm": 0.12847423650541548,
"learning_rate": 3.034331565988892e-06,
"loss": 0.297,
"step": 6033
},
{
"epoch": 2.9828204177481155,
"grad_norm": 0.12112573844412712,
"learning_rate": 3.031538257631811e-06,
"loss": 0.2969,
"step": 6034
},
{
"epoch": 2.9833147942157954,
"grad_norm": 0.11886659371824598,
"learning_rate": 3.0287460058607975e-06,
"loss": 0.3267,
"step": 6035
},
{
"epoch": 2.9838091706834753,
"grad_norm": 0.11954757952480698,
"learning_rate": 3.0259548110992265e-06,
"loss": 0.2863,
"step": 6036
},
{
"epoch": 2.9843035471511556,
"grad_norm": 0.12161796916577296,
"learning_rate": 3.023164673770315e-06,
"loss": 0.2958,
"step": 6037
},
{
"epoch": 2.984797923618836,
"grad_norm": 0.11873584537781173,
"learning_rate": 3.020375594297106e-06,
"loss": 0.2759,
"step": 6038
},
{
"epoch": 2.985292300086516,
"grad_norm": 0.12174370587841811,
"learning_rate": 3.0175875731024984e-06,
"loss": 0.2805,
"step": 6039
},
{
"epoch": 2.985786676554196,
"grad_norm": 0.1432375323573239,
"learning_rate": 3.014800610609221e-06,
"loss": 0.2985,
"step": 6040
},
{
"epoch": 2.986281053021876,
"grad_norm": 0.12018703394191521,
"learning_rate": 3.012014707239839e-06,
"loss": 0.2658,
"step": 6041
},
{
"epoch": 2.9867754294895565,
"grad_norm": 0.11572076962876383,
"learning_rate": 3.0092298634167672e-06,
"loss": 0.2917,
"step": 6042
},
{
"epoch": 2.9872698059572365,
"grad_norm": 0.1137003940817121,
"learning_rate": 3.0064460795622563e-06,
"loss": 0.2876,
"step": 6043
},
{
"epoch": 2.9877641824249164,
"grad_norm": 0.12700624462146348,
"learning_rate": 3.00366335609839e-06,
"loss": 0.3369,
"step": 6044
},
{
"epoch": 2.9882585588925967,
"grad_norm": 0.12691951887384587,
"learning_rate": 3.0008816934471007e-06,
"loss": 0.2682,
"step": 6045
},
{
"epoch": 2.988752935360277,
"grad_norm": 0.11538339264812172,
"learning_rate": 2.9981010920301547e-06,
"loss": 0.2933,
"step": 6046
},
{
"epoch": 2.989247311827957,
"grad_norm": 0.12251990829149438,
"learning_rate": 2.9953215522691483e-06,
"loss": 0.2936,
"step": 6047
},
{
"epoch": 2.989741688295637,
"grad_norm": 0.12063552429208023,
"learning_rate": 2.992543074585541e-06,
"loss": 0.2795,
"step": 6048
},
{
"epoch": 2.9902360647633173,
"grad_norm": 0.11547090036602249,
"learning_rate": 2.9897656594006095e-06,
"loss": 0.2856,
"step": 6049
},
{
"epoch": 2.9907304412309976,
"grad_norm": 0.12145190126772666,
"learning_rate": 2.986989307135475e-06,
"loss": 0.27,
"step": 6050
},
{
"epoch": 2.9912248176986775,
"grad_norm": 0.11498974456933593,
"learning_rate": 2.9842140182111035e-06,
"loss": 0.2835,
"step": 6051
},
{
"epoch": 2.9917191941663575,
"grad_norm": 0.11986943233898115,
"learning_rate": 2.981439793048291e-06,
"loss": 0.3149,
"step": 6052
},
{
"epoch": 2.992213570634038,
"grad_norm": 0.11793579051898712,
"learning_rate": 2.978666632067677e-06,
"loss": 0.2822,
"step": 6053
},
{
"epoch": 2.992707947101718,
"grad_norm": 0.11999752526135986,
"learning_rate": 2.975894535689746e-06,
"loss": 0.2956,
"step": 6054
},
{
"epoch": 2.993202323569398,
"grad_norm": 0.11848313050355043,
"learning_rate": 2.9731235043348093e-06,
"loss": 0.2973,
"step": 6055
},
{
"epoch": 2.993696700037078,
"grad_norm": 0.11613303617269553,
"learning_rate": 2.9703535384230173e-06,
"loss": 0.2809,
"step": 6056
},
{
"epoch": 2.9941910765047584,
"grad_norm": 0.11316589688519627,
"learning_rate": 2.9675846383743735e-06,
"loss": 0.2888,
"step": 6057
},
{
"epoch": 2.9946854529724387,
"grad_norm": 0.1200679731794305,
"learning_rate": 2.964816804608699e-06,
"loss": 0.3052,
"step": 6058
},
{
"epoch": 2.9951798294401186,
"grad_norm": 0.13208200157562194,
"learning_rate": 2.9620500375456695e-06,
"loss": 0.2971,
"step": 6059
},
{
"epoch": 2.9956742059077985,
"grad_norm": 0.12043036066413745,
"learning_rate": 2.959284337604795e-06,
"loss": 0.2925,
"step": 6060
},
{
"epoch": 2.996168582375479,
"grad_norm": 0.11556466979512438,
"learning_rate": 2.9565197052054184e-06,
"loss": 0.2665,
"step": 6061
},
{
"epoch": 2.9966629588431593,
"grad_norm": 0.11465225336942705,
"learning_rate": 2.953756140766727e-06,
"loss": 0.292,
"step": 6062
},
{
"epoch": 2.997157335310839,
"grad_norm": 0.11768413877814339,
"learning_rate": 2.9509936447077392e-06,
"loss": 0.2804,
"step": 6063
},
{
"epoch": 2.997651711778519,
"grad_norm": 0.11577966042782897,
"learning_rate": 2.94823221744732e-06,
"loss": 0.2767,
"step": 6064
},
{
"epoch": 2.9981460882461994,
"grad_norm": 0.11637800158980031,
"learning_rate": 2.9454718594041686e-06,
"loss": 0.2782,
"step": 6065
},
{
"epoch": 2.99864046471388,
"grad_norm": 0.1168106732617208,
"learning_rate": 2.9427125709968196e-06,
"loss": 0.2939,
"step": 6066
}
],
"logging_steps": 1,
"max_steps": 8088,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 2022,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.523129411839721e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}