|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0013449899125757, |
|
"eval_steps": 93, |
|
"global_step": 372, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0026899798251513113, |
|
"grad_norm": 1.8751003742218018, |
|
"learning_rate": 2e-05, |
|
"loss": 2.0835, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005379959650302623, |
|
"grad_norm": 2.948612928390503, |
|
"learning_rate": 4e-05, |
|
"loss": 2.7117, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.008069939475453935, |
|
"grad_norm": 2.65693998336792, |
|
"learning_rate": 6e-05, |
|
"loss": 2.2257, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.010759919300605245, |
|
"grad_norm": 3.0478532314300537, |
|
"learning_rate": 8e-05, |
|
"loss": 2.5187, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.013449899125756557, |
|
"grad_norm": 5.911397457122803, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3052, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01613987895090787, |
|
"grad_norm": 6.985674858093262, |
|
"learning_rate": 0.00012, |
|
"loss": 2.2497, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01882985877605918, |
|
"grad_norm": 12.322802543640137, |
|
"learning_rate": 0.00014, |
|
"loss": 2.3409, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02151983860121049, |
|
"grad_norm": 5.827511787414551, |
|
"learning_rate": 0.00016, |
|
"loss": 2.4239, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0242098184263618, |
|
"grad_norm": 4.404603481292725, |
|
"learning_rate": 0.00018, |
|
"loss": 2.2837, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.026899798251513115, |
|
"grad_norm": 5.033199310302734, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9387, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.029589778076664425, |
|
"grad_norm": 5.6907196044921875, |
|
"learning_rate": 0.00019999623426388962, |
|
"loss": 1.7416, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03227975790181574, |
|
"grad_norm": 6.009058952331543, |
|
"learning_rate": 0.00019998493733917384, |
|
"loss": 1.1876, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03496973772696705, |
|
"grad_norm": 7.73223352432251, |
|
"learning_rate": 0.00019996611007667742, |
|
"loss": 1.6472, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.03765971755211836, |
|
"grad_norm": 9.206978797912598, |
|
"learning_rate": 0.00019993975389437038, |
|
"loss": 1.3337, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.04034969737726967, |
|
"grad_norm": 8.522892951965332, |
|
"learning_rate": 0.00019990587077726128, |
|
"loss": 1.107, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04303967720242098, |
|
"grad_norm": 6.441080093383789, |
|
"learning_rate": 0.0001998644632772477, |
|
"loss": 1.3421, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04572965702757229, |
|
"grad_norm": 10.297110557556152, |
|
"learning_rate": 0.00019981553451292396, |
|
"loss": 1.542, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0484196368527236, |
|
"grad_norm": 8.839176177978516, |
|
"learning_rate": 0.0001997590881693464, |
|
"loss": 1.3331, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.05110961667787491, |
|
"grad_norm": 7.464210510253906, |
|
"learning_rate": 0.00019969512849775565, |
|
"loss": 0.6544, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05379959650302623, |
|
"grad_norm": 8.324406623840332, |
|
"learning_rate": 0.00019962366031525664, |
|
"loss": 0.6081, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05648957632817754, |
|
"grad_norm": 7.356775760650635, |
|
"learning_rate": 0.00019954468900445566, |
|
"loss": 0.8796, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.05917955615332885, |
|
"grad_norm": 4.948946475982666, |
|
"learning_rate": 0.00019945822051305507, |
|
"loss": 0.6637, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.06186953597848016, |
|
"grad_norm": 4.626830577850342, |
|
"learning_rate": 0.00019936426135340528, |
|
"loss": 0.9787, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06455951580363148, |
|
"grad_norm": 5.476314067840576, |
|
"learning_rate": 0.0001992628186020143, |
|
"loss": 1.1509, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06724949562878278, |
|
"grad_norm": 4.901749134063721, |
|
"learning_rate": 0.00019915389989901474, |
|
"loss": 1.2591, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0699394754539341, |
|
"grad_norm": 5.085122108459473, |
|
"learning_rate": 0.00019903751344758848, |
|
"loss": 0.7551, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0726294552790854, |
|
"grad_norm": 7.4879231452941895, |
|
"learning_rate": 0.0001989136680133488, |
|
"loss": 0.8804, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07531943510423672, |
|
"grad_norm": 4.756115913391113, |
|
"learning_rate": 0.00019878237292368013, |
|
"loss": 0.7665, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07800941492938802, |
|
"grad_norm": 4.777318954467773, |
|
"learning_rate": 0.0001986436380670357, |
|
"loss": 1.0634, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.08069939475453934, |
|
"grad_norm": 4.6226325035095215, |
|
"learning_rate": 0.00019849747389219272, |
|
"loss": 0.5563, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08338937457969066, |
|
"grad_norm": 4.621855735778809, |
|
"learning_rate": 0.0001983438914074654, |
|
"loss": 0.6377, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08607935440484196, |
|
"grad_norm": 4.654213905334473, |
|
"learning_rate": 0.00019818290217987587, |
|
"loss": 0.7768, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08876933422999328, |
|
"grad_norm": 4.971796989440918, |
|
"learning_rate": 0.00019801451833428312, |
|
"loss": 0.7793, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.09145931405514458, |
|
"grad_norm": 4.303219318389893, |
|
"learning_rate": 0.00019783875255246973, |
|
"loss": 0.7334, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0941492938802959, |
|
"grad_norm": 4.168240547180176, |
|
"learning_rate": 0.0001976556180721867, |
|
"loss": 0.7872, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0968392737054472, |
|
"grad_norm": 3.7110626697540283, |
|
"learning_rate": 0.00019746512868615656, |
|
"loss": 0.9156, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09952925353059852, |
|
"grad_norm": 4.882535457611084, |
|
"learning_rate": 0.00019726729874103448, |
|
"loss": 1.1328, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.10221923335574983, |
|
"grad_norm": 7.1945624351501465, |
|
"learning_rate": 0.00019706214313632784, |
|
"loss": 1.173, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.10490921318090114, |
|
"grad_norm": 8.771895408630371, |
|
"learning_rate": 0.00019684967732327396, |
|
"loss": 0.9978, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.10759919300605246, |
|
"grad_norm": 5.988046169281006, |
|
"learning_rate": 0.00019662991730367663, |
|
"loss": 0.7832, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11028917283120376, |
|
"grad_norm": 12.11080551147461, |
|
"learning_rate": 0.00019640287962870062, |
|
"loss": 1.593, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.11297915265635508, |
|
"grad_norm": 8.772387504577637, |
|
"learning_rate": 0.00019616858139762534, |
|
"loss": 0.7483, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.11566913248150638, |
|
"grad_norm": 8.320154190063477, |
|
"learning_rate": 0.000195927040256557, |
|
"loss": 0.7829, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.1183591123066577, |
|
"grad_norm": 9.61361026763916, |
|
"learning_rate": 0.00019567827439709954, |
|
"loss": 1.014, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.121049092131809, |
|
"grad_norm": 5.479886531829834, |
|
"learning_rate": 0.00019542230255498454, |
|
"loss": 0.5207, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.12373907195696032, |
|
"grad_norm": 18.779264450073242, |
|
"learning_rate": 0.0001951591440086602, |
|
"loss": 1.5535, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.12642905178211164, |
|
"grad_norm": 11.512892723083496, |
|
"learning_rate": 0.00019488881857783935, |
|
"loss": 1.3748, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.12911903160726296, |
|
"grad_norm": 16.13918685913086, |
|
"learning_rate": 0.00019461134662200668, |
|
"loss": 1.1999, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.13180901143241425, |
|
"grad_norm": 11.946172714233398, |
|
"learning_rate": 0.00019432674903888548, |
|
"loss": 0.7801, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.13449899125756556, |
|
"grad_norm": 10.913107872009277, |
|
"learning_rate": 0.0001940350472628637, |
|
"loss": 0.4945, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13718897108271688, |
|
"grad_norm": 7.555224895477295, |
|
"learning_rate": 0.00019373626326337946, |
|
"loss": 1.7617, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.1398789509078682, |
|
"grad_norm": 8.530354499816895, |
|
"learning_rate": 0.0001934304195432668, |
|
"loss": 2.1801, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1425689307330195, |
|
"grad_norm": 6.264369487762451, |
|
"learning_rate": 0.0001931175391370605, |
|
"loss": 1.514, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.1452589105581708, |
|
"grad_norm": 5.179037094116211, |
|
"learning_rate": 0.00019279764560926142, |
|
"loss": 1.746, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.14794889038332212, |
|
"grad_norm": 4.9991536140441895, |
|
"learning_rate": 0.00019247076305256176, |
|
"loss": 1.4403, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.15063887020847344, |
|
"grad_norm": 5.004469871520996, |
|
"learning_rate": 0.00019213691608603047, |
|
"loss": 2.0104, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.15332885003362476, |
|
"grad_norm": 5.55224609375, |
|
"learning_rate": 0.00019179612985325908, |
|
"loss": 1.5966, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.15601882985877605, |
|
"grad_norm": 4.276426315307617, |
|
"learning_rate": 0.00019144843002046806, |
|
"loss": 1.3126, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.15870880968392737, |
|
"grad_norm": 6.414350509643555, |
|
"learning_rate": 0.0001910938427745737, |
|
"loss": 1.4917, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.16139878950907868, |
|
"grad_norm": 4.202977657318115, |
|
"learning_rate": 0.000190732394821216, |
|
"loss": 1.0693, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.16408876933423, |
|
"grad_norm": 4.08539342880249, |
|
"learning_rate": 0.00019036411338274703, |
|
"loss": 1.0895, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.16677874915938132, |
|
"grad_norm": 3.5492026805877686, |
|
"learning_rate": 0.00018998902619618116, |
|
"loss": 0.9263, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1694687289845326, |
|
"grad_norm": 3.8257992267608643, |
|
"learning_rate": 0.00018960716151110554, |
|
"loss": 1.3858, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.17215870880968392, |
|
"grad_norm": 3.3236279487609863, |
|
"learning_rate": 0.00018921854808755294, |
|
"loss": 1.0358, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.17484868863483524, |
|
"grad_norm": 3.356065511703491, |
|
"learning_rate": 0.00018882321519383534, |
|
"loss": 0.8704, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.17753866845998656, |
|
"grad_norm": 4.296020984649658, |
|
"learning_rate": 0.00018842119260433982, |
|
"loss": 1.0503, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.18022864828513785, |
|
"grad_norm": 5.693877696990967, |
|
"learning_rate": 0.00018801251059728604, |
|
"loss": 0.6922, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.18291862811028917, |
|
"grad_norm": 5.999386310577393, |
|
"learning_rate": 0.0001875971999524458, |
|
"loss": 0.824, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.18560860793544048, |
|
"grad_norm": 4.900099754333496, |
|
"learning_rate": 0.000187175291948825, |
|
"loss": 0.7182, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.1882985877605918, |
|
"grad_norm": 4.141096115112305, |
|
"learning_rate": 0.0001867468183623077, |
|
"loss": 0.4006, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.19098856758574312, |
|
"grad_norm": 6.806368350982666, |
|
"learning_rate": 0.00018631181146326305, |
|
"loss": 0.8936, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.1936785474108944, |
|
"grad_norm": 2.945824146270752, |
|
"learning_rate": 0.0001858703040141148, |
|
"loss": 0.5178, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.19636852723604573, |
|
"grad_norm": 6.985172271728516, |
|
"learning_rate": 0.00018542232926687383, |
|
"loss": 0.8644, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.19905850706119704, |
|
"grad_norm": 5.231998920440674, |
|
"learning_rate": 0.0001849679209606338, |
|
"loss": 1.0585, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.20174848688634836, |
|
"grad_norm": 4.8978705406188965, |
|
"learning_rate": 0.00018450711331903006, |
|
"loss": 0.7828, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.20443846671149965, |
|
"grad_norm": 5.309878826141357, |
|
"learning_rate": 0.00018403994104766212, |
|
"loss": 0.7666, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.20712844653665097, |
|
"grad_norm": 5.227763652801514, |
|
"learning_rate": 0.00018356643933147986, |
|
"loss": 0.8396, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.20981842636180228, |
|
"grad_norm": 4.239619731903076, |
|
"learning_rate": 0.00018308664383213344, |
|
"loss": 0.6439, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.2125084061869536, |
|
"grad_norm": 5.731531620025635, |
|
"learning_rate": 0.00018260059068528762, |
|
"loss": 1.3371, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.21519838601210492, |
|
"grad_norm": 4.006597518920898, |
|
"learning_rate": 0.00018210831649790018, |
|
"loss": 0.5272, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2178883658372562, |
|
"grad_norm": 4.0596699714660645, |
|
"learning_rate": 0.00018160985834546475, |
|
"loss": 0.4416, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.22057834566240753, |
|
"grad_norm": 4.053659915924072, |
|
"learning_rate": 0.00018110525376921862, |
|
"loss": 0.4781, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.22326832548755884, |
|
"grad_norm": 4.120569705963135, |
|
"learning_rate": 0.00018059454077331527, |
|
"loss": 0.8082, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.22595830531271016, |
|
"grad_norm": 6.420701503753662, |
|
"learning_rate": 0.00018007775782196214, |
|
"loss": 0.5476, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.22864828513786148, |
|
"grad_norm": 13.603157997131348, |
|
"learning_rate": 0.00017955494383652365, |
|
"loss": 0.7857, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.23133826496301277, |
|
"grad_norm": 7.784971237182617, |
|
"learning_rate": 0.00017902613819258985, |
|
"loss": 0.7705, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.23402824478816409, |
|
"grad_norm": 5.966789722442627, |
|
"learning_rate": 0.00017849138071701092, |
|
"loss": 0.9065, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.2367182246133154, |
|
"grad_norm": 9.451849937438965, |
|
"learning_rate": 0.0001779507116848976, |
|
"loss": 0.998, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.23940820443846672, |
|
"grad_norm": 6.712332725524902, |
|
"learning_rate": 0.00017740417181658788, |
|
"loss": 1.1464, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.242098184263618, |
|
"grad_norm": 4.188553333282471, |
|
"learning_rate": 0.00017685180227458003, |
|
"loss": 0.6356, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.24478816408876933, |
|
"grad_norm": 9.703150749206543, |
|
"learning_rate": 0.00017629364466043273, |
|
"loss": 1.0548, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.24747814391392065, |
|
"grad_norm": 6.08168363571167, |
|
"learning_rate": 0.00017572974101163165, |
|
"loss": 0.7252, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.25016812373907193, |
|
"grad_norm": 10.167409896850586, |
|
"learning_rate": 0.00017516013379842337, |
|
"loss": 0.658, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.25016812373907193, |
|
"eval_loss": 0.9310864210128784, |
|
"eval_runtime": 10.8326, |
|
"eval_samples_per_second": 14.493, |
|
"eval_steps_per_second": 7.293, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.2528581035642233, |
|
"grad_norm": 5.78995418548584, |
|
"learning_rate": 0.00017458486592061704, |
|
"loss": 0.9346, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.25554808338937457, |
|
"grad_norm": 3.517900228500366, |
|
"learning_rate": 0.00017400398070435293, |
|
"loss": 0.3506, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2582380632145259, |
|
"grad_norm": 5.804417610168457, |
|
"learning_rate": 0.00017341752189883983, |
|
"loss": 0.4959, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2609280430396772, |
|
"grad_norm": 8.148117065429688, |
|
"learning_rate": 0.00017282553367305975, |
|
"loss": 0.9842, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.2636180228648285, |
|
"grad_norm": 9.511378288269043, |
|
"learning_rate": 0.0001722280606124415, |
|
"loss": 0.7143, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.26630800268997984, |
|
"grad_norm": 6.079991340637207, |
|
"learning_rate": 0.00017162514771550255, |
|
"loss": 0.2979, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.26899798251513113, |
|
"grad_norm": 6.114333152770996, |
|
"learning_rate": 0.00017101684039046036, |
|
"loss": 0.5812, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2716879623402825, |
|
"grad_norm": 4.91884183883667, |
|
"learning_rate": 0.0001704031844518121, |
|
"loss": 1.8317, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.27437794216543376, |
|
"grad_norm": 5.735188007354736, |
|
"learning_rate": 0.0001697842261168843, |
|
"loss": 2.3345, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.27706792199058505, |
|
"grad_norm": 5.317649841308594, |
|
"learning_rate": 0.0001691600120023521, |
|
"loss": 2.0851, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.2797579018157364, |
|
"grad_norm": 7.778799057006836, |
|
"learning_rate": 0.00016853058912072802, |
|
"loss": 1.1674, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.2824478816408877, |
|
"grad_norm": 4.196416854858398, |
|
"learning_rate": 0.00016789600487682156, |
|
"loss": 1.5939, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.285137861466039, |
|
"grad_norm": 4.3927741050720215, |
|
"learning_rate": 0.0001672563070641688, |
|
"loss": 1.4615, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2878278412911903, |
|
"grad_norm": 4.284142017364502, |
|
"learning_rate": 0.0001666115438614328, |
|
"loss": 1.9508, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.2905178211163416, |
|
"grad_norm": 5.4508867263793945, |
|
"learning_rate": 0.00016596176382877506, |
|
"loss": 1.3256, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.29320780094149296, |
|
"grad_norm": 11.987678527832031, |
|
"learning_rate": 0.00016530701590419824, |
|
"loss": 0.9202, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.29589778076664425, |
|
"grad_norm": 5.667636394500732, |
|
"learning_rate": 0.00016464734939986036, |
|
"loss": 1.3247, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.29858776059179554, |
|
"grad_norm": 3.8087687492370605, |
|
"learning_rate": 0.00016398281399836097, |
|
"loss": 0.9626, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.3012777404169469, |
|
"grad_norm": 5.772204875946045, |
|
"learning_rate": 0.00016331345974899923, |
|
"loss": 1.3912, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.30396772024209817, |
|
"grad_norm": 3.2174160480499268, |
|
"learning_rate": 0.00016263933706400451, |
|
"loss": 1.0545, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.3066577000672495, |
|
"grad_norm": 3.539743423461914, |
|
"learning_rate": 0.00016196049671473954, |
|
"loss": 0.9489, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.3093476798924008, |
|
"grad_norm": 3.6935033798217773, |
|
"learning_rate": 0.0001612769898278766, |
|
"loss": 1.0005, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3120376597175521, |
|
"grad_norm": 3.477961301803589, |
|
"learning_rate": 0.00016058886788154712, |
|
"loss": 0.6155, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.31472763954270344, |
|
"grad_norm": 3.9399242401123047, |
|
"learning_rate": 0.00015989618270146423, |
|
"loss": 0.7689, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.31741761936785473, |
|
"grad_norm": 4.4496846199035645, |
|
"learning_rate": 0.0001591989864570199, |
|
"loss": 1.0174, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.3201075991930061, |
|
"grad_norm": 4.519758224487305, |
|
"learning_rate": 0.00015849733165735556, |
|
"loss": 0.9051, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.32279757901815737, |
|
"grad_norm": 3.636235237121582, |
|
"learning_rate": 0.00015779127114740757, |
|
"loss": 0.5993, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.32548755884330866, |
|
"grad_norm": 2.2947537899017334, |
|
"learning_rate": 0.0001570808581039271, |
|
"loss": 0.23, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.32817753866846, |
|
"grad_norm": 3.0490782260894775, |
|
"learning_rate": 0.00015636614603147512, |
|
"loss": 0.5818, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.3308675184936113, |
|
"grad_norm": 3.2933220863342285, |
|
"learning_rate": 0.0001556471887583929, |
|
"loss": 0.6548, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.33355749831876264, |
|
"grad_norm": 4.488528251647949, |
|
"learning_rate": 0.0001549240404327477, |
|
"loss": 0.9628, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3362474781439139, |
|
"grad_norm": 4.679425239562988, |
|
"learning_rate": 0.00015419675551825475, |
|
"loss": 0.4106, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3389374579690652, |
|
"grad_norm": 4.400868892669678, |
|
"learning_rate": 0.0001534653887901754, |
|
"loss": 0.3852, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.34162743779421656, |
|
"grad_norm": 4.978918552398682, |
|
"learning_rate": 0.00015272999533119162, |
|
"loss": 0.8162, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.34431741761936785, |
|
"grad_norm": 5.046586990356445, |
|
"learning_rate": 0.00015199063052725745, |
|
"loss": 0.649, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.34700739744451914, |
|
"grad_norm": 7.412467956542969, |
|
"learning_rate": 0.0001512473500634277, |
|
"loss": 0.6579, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.3496973772696705, |
|
"grad_norm": 3.8262441158294678, |
|
"learning_rate": 0.00015050020991966406, |
|
"loss": 0.4359, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3523873570948218, |
|
"grad_norm": 5.179169654846191, |
|
"learning_rate": 0.0001497492663666189, |
|
"loss": 0.5676, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.3550773369199731, |
|
"grad_norm": 5.74229097366333, |
|
"learning_rate": 0.00014899457596139729, |
|
"loss": 0.3635, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3577673167451244, |
|
"grad_norm": 7.098540782928467, |
|
"learning_rate": 0.00014823619554329745, |
|
"loss": 0.996, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.3604572965702757, |
|
"grad_norm": 4.635382652282715, |
|
"learning_rate": 0.00014747418222952995, |
|
"loss": 0.7149, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.36314727639542704, |
|
"grad_norm": 3.750243663787842, |
|
"learning_rate": 0.0001467085934109158, |
|
"loss": 0.3169, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.36583725622057833, |
|
"grad_norm": 4.545015811920166, |
|
"learning_rate": 0.00014593948674756417, |
|
"loss": 0.5511, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.3685272360457297, |
|
"grad_norm": 5.990297794342041, |
|
"learning_rate": 0.0001451669201645298, |
|
"loss": 0.766, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.37121721587088097, |
|
"grad_norm": 3.692354679107666, |
|
"learning_rate": 0.00014439095184745024, |
|
"loss": 0.4151, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.37390719569603226, |
|
"grad_norm": 3.4247729778289795, |
|
"learning_rate": 0.00014361164023816376, |
|
"loss": 0.466, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.3765971755211836, |
|
"grad_norm": 3.962257146835327, |
|
"learning_rate": 0.00014282904403030772, |
|
"loss": 0.4263, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3792871553463349, |
|
"grad_norm": 5.7197771072387695, |
|
"learning_rate": 0.00014204322216489814, |
|
"loss": 0.4988, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.38197713517148624, |
|
"grad_norm": 5.587864398956299, |
|
"learning_rate": 0.00014125423382589048, |
|
"loss": 0.6946, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3846671149966375, |
|
"grad_norm": 11.981307029724121, |
|
"learning_rate": 0.00014046213843572236, |
|
"loss": 0.7456, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.3873570948217888, |
|
"grad_norm": 6.747979164123535, |
|
"learning_rate": 0.00013966699565083802, |
|
"loss": 1.2804, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.39004707464694016, |
|
"grad_norm": 4.663575649261475, |
|
"learning_rate": 0.0001388688653571954, |
|
"loss": 0.5548, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.39273705447209145, |
|
"grad_norm": 5.274585247039795, |
|
"learning_rate": 0.00013806780766575588, |
|
"loss": 0.6681, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.3954270342972428, |
|
"grad_norm": 13.038918495178223, |
|
"learning_rate": 0.00013726388290795697, |
|
"loss": 1.082, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.3981170141223941, |
|
"grad_norm": 7.035642623901367, |
|
"learning_rate": 0.00013645715163116846, |
|
"loss": 0.3975, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.4008069939475454, |
|
"grad_norm": 5.065128326416016, |
|
"learning_rate": 0.00013564767459413237, |
|
"loss": 0.2747, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.4034969737726967, |
|
"grad_norm": 4.475830554962158, |
|
"learning_rate": 0.0001348355127623869, |
|
"loss": 0.2169, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.406186953597848, |
|
"grad_norm": 4.0652031898498535, |
|
"learning_rate": 0.00013402072730367475, |
|
"loss": 1.7546, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.4088769334229993, |
|
"grad_norm": 4.62870454788208, |
|
"learning_rate": 0.0001332033795833364, |
|
"loss": 1.5081, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.41156691324815065, |
|
"grad_norm": 3.8758082389831543, |
|
"learning_rate": 0.0001323835311596884, |
|
"loss": 1.371, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.41425689307330194, |
|
"grad_norm": 4.078228950500488, |
|
"learning_rate": 0.00013156124377938699, |
|
"loss": 1.5507, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.4169468728984533, |
|
"grad_norm": 3.6525630950927734, |
|
"learning_rate": 0.0001307365793727778, |
|
"loss": 1.1093, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.41963685272360457, |
|
"grad_norm": 4.3088202476501465, |
|
"learning_rate": 0.00012990960004923154, |
|
"loss": 1.6154, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.42232683254875586, |
|
"grad_norm": 4.335425853729248, |
|
"learning_rate": 0.00012908036809246623, |
|
"loss": 1.4037, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.4250168123739072, |
|
"grad_norm": 3.7850985527038574, |
|
"learning_rate": 0.00012824894595585637, |
|
"loss": 1.1471, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.4277067921990585, |
|
"grad_norm": 4.085525035858154, |
|
"learning_rate": 0.00012741539625772918, |
|
"loss": 1.2586, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.43039677202420984, |
|
"grad_norm": 3.4970481395721436, |
|
"learning_rate": 0.0001265797817766486, |
|
"loss": 1.1133, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.43308675184936113, |
|
"grad_norm": 4.015367031097412, |
|
"learning_rate": 0.0001257421654466872, |
|
"loss": 0.71, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.4357767316745124, |
|
"grad_norm": 3.805530071258545, |
|
"learning_rate": 0.00012490261035268612, |
|
"loss": 1.4369, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.43846671149966376, |
|
"grad_norm": 4.442086696624756, |
|
"learning_rate": 0.00012406117972550414, |
|
"loss": 1.1577, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.44115669132481505, |
|
"grad_norm": 3.171997308731079, |
|
"learning_rate": 0.00012321793693725509, |
|
"loss": 0.667, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4438466711499664, |
|
"grad_norm": 4.185075759887695, |
|
"learning_rate": 0.0001223729454965354, |
|
"loss": 0.7278, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.4465366509751177, |
|
"grad_norm": 3.8975086212158203, |
|
"learning_rate": 0.00012152626904364067, |
|
"loss": 0.9939, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.449226630800269, |
|
"grad_norm": 3.1474146842956543, |
|
"learning_rate": 0.00012067797134577275, |
|
"loss": 0.7392, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.4519166106254203, |
|
"grad_norm": 3.5632522106170654, |
|
"learning_rate": 0.00011982811629223709, |
|
"loss": 0.8636, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.4546065904505716, |
|
"grad_norm": 2.6525533199310303, |
|
"learning_rate": 0.00011897676788963101, |
|
"loss": 0.3818, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.45729657027572296, |
|
"grad_norm": 3.889469861984253, |
|
"learning_rate": 0.0001181239902570229, |
|
"loss": 0.5985, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.45998655010087425, |
|
"grad_norm": 3.6286370754241943, |
|
"learning_rate": 0.00011726984762112328, |
|
"loss": 0.8639, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.46267652992602554, |
|
"grad_norm": 2.5282163619995117, |
|
"learning_rate": 0.0001164144043114475, |
|
"loss": 0.3303, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.4653665097511769, |
|
"grad_norm": 4.00683069229126, |
|
"learning_rate": 0.00011555772475547084, |
|
"loss": 0.414, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.46805648957632817, |
|
"grad_norm": 5.255921363830566, |
|
"learning_rate": 0.00011469987347377602, |
|
"loss": 0.8622, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.47074646940147946, |
|
"grad_norm": 4.201918601989746, |
|
"learning_rate": 0.00011384091507519403, |
|
"loss": 0.8862, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4734364492266308, |
|
"grad_norm": 4.199880599975586, |
|
"learning_rate": 0.00011298091425193806, |
|
"loss": 0.4554, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.4761264290517821, |
|
"grad_norm": 3.6669838428497314, |
|
"learning_rate": 0.00011211993577473121, |
|
"loss": 0.343, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.47881640887693344, |
|
"grad_norm": 4.186169147491455, |
|
"learning_rate": 0.00011125804448792831, |
|
"loss": 0.8039, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.48150638870208473, |
|
"grad_norm": 4.209519386291504, |
|
"learning_rate": 0.00011039530530463218, |
|
"loss": 0.3221, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.484196368527236, |
|
"grad_norm": 2.875875234603882, |
|
"learning_rate": 0.00010953178320180475, |
|
"loss": 0.2874, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.48688634835238737, |
|
"grad_norm": 4.24071741104126, |
|
"learning_rate": 0.00010866754321537338, |
|
"loss": 0.5502, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.48957632817753866, |
|
"grad_norm": 4.230165481567383, |
|
"learning_rate": 0.0001078026504353325, |
|
"loss": 0.5396, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.49226630800269, |
|
"grad_norm": 4.387772560119629, |
|
"learning_rate": 0.0001069371700008416, |
|
"loss": 0.5987, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.4949562878278413, |
|
"grad_norm": 4.988356113433838, |
|
"learning_rate": 0.00010607116709531918, |
|
"loss": 0.6046, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.4976462676529926, |
|
"grad_norm": 4.388515472412109, |
|
"learning_rate": 0.00010520470694153353, |
|
"loss": 0.595, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5003362474781439, |
|
"grad_norm": 4.310067653656006, |
|
"learning_rate": 0.00010433785479669038, |
|
"loss": 0.5557, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.5003362474781439, |
|
"eval_loss": 0.8278390765190125, |
|
"eval_runtime": 10.698, |
|
"eval_samples_per_second": 14.676, |
|
"eval_steps_per_second": 7.385, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.5030262273032953, |
|
"grad_norm": 4.27438497543335, |
|
"learning_rate": 0.0001034706759475182, |
|
"loss": 0.7565, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.5057162071284466, |
|
"grad_norm": 6.716769218444824, |
|
"learning_rate": 0.0001026032357053512, |
|
"loss": 0.8371, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.5084061869535978, |
|
"grad_norm": 2.8589985370635986, |
|
"learning_rate": 0.0001017355994012102, |
|
"loss": 0.2835, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.5110961667787491, |
|
"grad_norm": 2.675102949142456, |
|
"learning_rate": 0.00010086783238088244, |
|
"loss": 0.2332, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5137861466039004, |
|
"grad_norm": 5.922438144683838, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3974, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.5164761264290518, |
|
"grad_norm": 9.693557739257812, |
|
"learning_rate": 9.913216761911755e-05, |
|
"loss": 0.8956, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.5191661062542031, |
|
"grad_norm": 5.175564765930176, |
|
"learning_rate": 9.826440059878982e-05, |
|
"loss": 0.4924, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.5218560860793544, |
|
"grad_norm": 4.789131164550781, |
|
"learning_rate": 9.739676429464881e-05, |
|
"loss": 0.6214, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.5245460659045057, |
|
"grad_norm": 4.584465026855469, |
|
"learning_rate": 9.652932405248181e-05, |
|
"loss": 0.6807, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.527236045729657, |
|
"grad_norm": 8.758447647094727, |
|
"learning_rate": 9.566214520330966e-05, |
|
"loss": 1.2434, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.5299260255548084, |
|
"grad_norm": 8.869791984558105, |
|
"learning_rate": 9.479529305846652e-05, |
|
"loss": 0.903, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.5326160053799597, |
|
"grad_norm": 3.716257333755493, |
|
"learning_rate": 9.392883290468083e-05, |
|
"loss": 0.3255, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.535305985205111, |
|
"grad_norm": 5.400476455688477, |
|
"learning_rate": 9.306282999915839e-05, |
|
"loss": 0.4036, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5379959650302623, |
|
"grad_norm": 3.5948078632354736, |
|
"learning_rate": 9.219734956466752e-05, |
|
"loss": 0.1489, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5406859448554135, |
|
"grad_norm": 3.1145272254943848, |
|
"learning_rate": 9.133245678462663e-05, |
|
"loss": 1.6383, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.543375924680565, |
|
"grad_norm": 3.916707754135132, |
|
"learning_rate": 9.046821679819527e-05, |
|
"loss": 1.7301, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5460659045057162, |
|
"grad_norm": 3.5734024047851562, |
|
"learning_rate": 8.960469469536786e-05, |
|
"loss": 1.1493, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.5487558843308675, |
|
"grad_norm": 3.514202356338501, |
|
"learning_rate": 8.874195551207174e-05, |
|
"loss": 1.2045, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5514458641560188, |
|
"grad_norm": 3.9733996391296387, |
|
"learning_rate": 8.788006422526881e-05, |
|
"loss": 1.8557, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5541358439811701, |
|
"grad_norm": 4.543676853179932, |
|
"learning_rate": 8.701908574806197e-05, |
|
"loss": 1.3852, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.5568258238063215, |
|
"grad_norm": 3.916227102279663, |
|
"learning_rate": 8.615908492480598e-05, |
|
"loss": 1.1968, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.5595158036314728, |
|
"grad_norm": 4.097233295440674, |
|
"learning_rate": 8.530012652622397e-05, |
|
"loss": 1.2521, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5622057834566241, |
|
"grad_norm": 3.402857780456543, |
|
"learning_rate": 8.444227524452918e-05, |
|
"loss": 0.8308, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.5648957632817754, |
|
"grad_norm": 4.228405952453613, |
|
"learning_rate": 8.358559568855249e-05, |
|
"loss": 1.1485, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5675857431069267, |
|
"grad_norm": 3.699495553970337, |
|
"learning_rate": 8.273015237887673e-05, |
|
"loss": 0.8204, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.570275722932078, |
|
"grad_norm": 3.4216742515563965, |
|
"learning_rate": 8.187600974297714e-05, |
|
"loss": 0.7515, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5729657027572294, |
|
"grad_norm": 3.6286754608154297, |
|
"learning_rate": 8.102323211036904e-05, |
|
"loss": 0.6262, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.5756556825823806, |
|
"grad_norm": 4.869045734405518, |
|
"learning_rate": 8.017188370776292e-05, |
|
"loss": 0.9851, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5783456624075319, |
|
"grad_norm": 5.106837749481201, |
|
"learning_rate": 7.932202865422726e-05, |
|
"loss": 1.122, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5810356422326832, |
|
"grad_norm": 4.490080833435059, |
|
"learning_rate": 7.847373095635937e-05, |
|
"loss": 0.8354, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5837256220578345, |
|
"grad_norm": 3.4671199321746826, |
|
"learning_rate": 7.762705450346462e-05, |
|
"loss": 0.8034, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.5864156018829859, |
|
"grad_norm": 5.198472499847412, |
|
"learning_rate": 7.678206306274495e-05, |
|
"loss": 0.9827, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5891055817081372, |
|
"grad_norm": 3.910879135131836, |
|
"learning_rate": 7.59388202744959e-05, |
|
"loss": 0.5839, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.5917955615332885, |
|
"grad_norm": 3.6499719619750977, |
|
"learning_rate": 7.509738964731389e-05, |
|
"loss": 0.4724, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5944855413584398, |
|
"grad_norm": 4.461284160614014, |
|
"learning_rate": 7.425783455331281e-05, |
|
"loss": 0.6628, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.5971755211835911, |
|
"grad_norm": 3.570058584213257, |
|
"learning_rate": 7.342021822335143e-05, |
|
"loss": 0.455, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5998655010087425, |
|
"grad_norm": 4.93247652053833, |
|
"learning_rate": 7.258460374227085e-05, |
|
"loss": 1.1265, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.6025554808338938, |
|
"grad_norm": 3.4849631786346436, |
|
"learning_rate": 7.175105404414362e-05, |
|
"loss": 0.3686, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.605245460659045, |
|
"grad_norm": 2.503511428833008, |
|
"learning_rate": 7.091963190753376e-05, |
|
"loss": 0.3223, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.6079354404841963, |
|
"grad_norm": 4.293323040008545, |
|
"learning_rate": 7.009039995076844e-05, |
|
"loss": 0.5115, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.6106254203093476, |
|
"grad_norm": 4.681167125701904, |
|
"learning_rate": 6.926342062722223e-05, |
|
"loss": 0.8861, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.613315400134499, |
|
"grad_norm": 4.363972187042236, |
|
"learning_rate": 6.843875622061304e-05, |
|
"loss": 0.714, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.6160053799596503, |
|
"grad_norm": 4.711676597595215, |
|
"learning_rate": 6.761646884031164e-05, |
|
"loss": 0.9218, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.6186953597848016, |
|
"grad_norm": 5.378860950469971, |
|
"learning_rate": 6.679662041666362e-05, |
|
"loss": 0.8466, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6213853396099529, |
|
"grad_norm": 4.945761203765869, |
|
"learning_rate": 6.597927269632526e-05, |
|
"loss": 0.8303, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.6240753194351042, |
|
"grad_norm": 5.643571853637695, |
|
"learning_rate": 6.516448723761315e-05, |
|
"loss": 0.9418, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.6267652992602556, |
|
"grad_norm": 3.7865352630615234, |
|
"learning_rate": 6.435232540586763e-05, |
|
"loss": 0.447, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.6294552790854069, |
|
"grad_norm": 4.775557994842529, |
|
"learning_rate": 6.354284836883156e-05, |
|
"loss": 0.7068, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.6321452589105582, |
|
"grad_norm": 4.313781261444092, |
|
"learning_rate": 6.273611709204304e-05, |
|
"loss": 0.4621, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6348352387357095, |
|
"grad_norm": 3.1749980449676514, |
|
"learning_rate": 6.193219233424414e-05, |
|
"loss": 0.2996, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.6375252185608608, |
|
"grad_norm": 5.823219299316406, |
|
"learning_rate": 6.11311346428046e-05, |
|
"loss": 0.7987, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.6402151983860122, |
|
"grad_norm": 4.825180530548096, |
|
"learning_rate": 6.033300434916203e-05, |
|
"loss": 0.9309, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.6429051782111634, |
|
"grad_norm": 7.609663486480713, |
|
"learning_rate": 5.9537861564277654e-05, |
|
"loss": 0.6973, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.6455951580363147, |
|
"grad_norm": 3.5007476806640625, |
|
"learning_rate": 5.8745766174109495e-05, |
|
"loss": 0.5831, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.648285137861466, |
|
"grad_norm": 4.732518196105957, |
|
"learning_rate": 5.795677783510187e-05, |
|
"loss": 0.447, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.6509751176866173, |
|
"grad_norm": 4.874922752380371, |
|
"learning_rate": 5.7170955969692265e-05, |
|
"loss": 0.429, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.6536650975117687, |
|
"grad_norm": 2.738624095916748, |
|
"learning_rate": 5.638835976183627e-05, |
|
"loss": 0.316, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.65635507733692, |
|
"grad_norm": 4.868707180023193, |
|
"learning_rate": 5.5609048152549794e-05, |
|
"loss": 0.528, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6590450571620713, |
|
"grad_norm": 2.545112371444702, |
|
"learning_rate": 5.483307983547026e-05, |
|
"loss": 0.2108, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6617350369872226, |
|
"grad_norm": 5.225026607513428, |
|
"learning_rate": 5.406051325243586e-05, |
|
"loss": 0.6883, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6644250168123739, |
|
"grad_norm": 6.781031608581543, |
|
"learning_rate": 5.329140658908423e-05, |
|
"loss": 0.7684, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.6671149966375253, |
|
"grad_norm": 4.9204630851745605, |
|
"learning_rate": 5.2525817770470084e-05, |
|
"loss": 0.529, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6698049764626766, |
|
"grad_norm": 4.677596092224121, |
|
"learning_rate": 5.1763804456702545e-05, |
|
"loss": 0.1916, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6724949562878278, |
|
"grad_norm": 5.289106369018555, |
|
"learning_rate": 5.1005424038602724e-05, |
|
"loss": 0.5018, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6751849361129791, |
|
"grad_norm": 3.4332501888275146, |
|
"learning_rate": 5.025073363338111e-05, |
|
"loss": 1.4558, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.6778749159381304, |
|
"grad_norm": 3.6167006492614746, |
|
"learning_rate": 4.949979008033596e-05, |
|
"loss": 1.3181, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6805648957632818, |
|
"grad_norm": 4.852591514587402, |
|
"learning_rate": 4.8752649936572304e-05, |
|
"loss": 1.5339, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.6832548755884331, |
|
"grad_norm": 4.116457462310791, |
|
"learning_rate": 4.800936947274255e-05, |
|
"loss": 1.8746, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6859448554135844, |
|
"grad_norm": 3.9800620079040527, |
|
"learning_rate": 4.7270004668808397e-05, |
|
"loss": 1.8474, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6886348352387357, |
|
"grad_norm": 3.9870662689208984, |
|
"learning_rate": 4.65346112098246e-05, |
|
"loss": 1.148, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.691324815063887, |
|
"grad_norm": 4.0448503494262695, |
|
"learning_rate": 4.5803244481745275e-05, |
|
"loss": 1.0557, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.6940147948890383, |
|
"grad_norm": 4.282130241394043, |
|
"learning_rate": 4.5075959567252335e-05, |
|
"loss": 1.2731, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6967047747141897, |
|
"grad_norm": 4.488638401031494, |
|
"learning_rate": 4.435281124160715e-05, |
|
"loss": 1.3722, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.699394754539341, |
|
"grad_norm": 3.9812207221984863, |
|
"learning_rate": 4.363385396852491e-05, |
|
"loss": 1.3536, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7020847343644923, |
|
"grad_norm": 5.047592639923096, |
|
"learning_rate": 4.291914189607297e-05, |
|
"loss": 0.7947, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.7047747141896435, |
|
"grad_norm": 4.314056396484375, |
|
"learning_rate": 4.220872885259247e-05, |
|
"loss": 0.6368, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.7074646940147948, |
|
"grad_norm": 4.209965705871582, |
|
"learning_rate": 4.1502668342644455e-05, |
|
"loss": 0.7786, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.7101546738399462, |
|
"grad_norm": 5.0463151931762695, |
|
"learning_rate": 4.080101354298016e-05, |
|
"loss": 0.5957, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.7128446536650975, |
|
"grad_norm": 5.0057373046875, |
|
"learning_rate": 4.0103817298535794e-05, |
|
"loss": 1.1632, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.7155346334902488, |
|
"grad_norm": 4.640236854553223, |
|
"learning_rate": 3.9411132118452896e-05, |
|
"loss": 1.523, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.7182246133154001, |
|
"grad_norm": 3.843372344970703, |
|
"learning_rate": 3.872301017212337e-05, |
|
"loss": 1.0536, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.7209145931405514, |
|
"grad_norm": 4.180513858795166, |
|
"learning_rate": 3.8039503285260506e-05, |
|
"loss": 0.7683, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.7236045729657028, |
|
"grad_norm": 7.624268054962158, |
|
"learning_rate": 3.73606629359955e-05, |
|
"loss": 0.817, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.7262945527908541, |
|
"grad_norm": 5.006471633911133, |
|
"learning_rate": 3.6686540251000756e-05, |
|
"loss": 0.899, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7289845326160054, |
|
"grad_norm": 4.611303806304932, |
|
"learning_rate": 3.6017186001639036e-05, |
|
"loss": 0.3317, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.7316745124411567, |
|
"grad_norm": 4.701048374176025, |
|
"learning_rate": 3.535265060013965e-05, |
|
"loss": 0.9114, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.734364492266308, |
|
"grad_norm": 5.010507583618164, |
|
"learning_rate": 3.4692984095801796e-05, |
|
"loss": 0.6747, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.7370544720914594, |
|
"grad_norm": 4.114429473876953, |
|
"learning_rate": 3.4038236171224946e-05, |
|
"loss": 0.5408, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.7397444519166106, |
|
"grad_norm": 5.027583122253418, |
|
"learning_rate": 3.3388456138567225e-05, |
|
"loss": 0.4589, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.7424344317417619, |
|
"grad_norm": 5.864731788635254, |
|
"learning_rate": 3.274369293583121e-05, |
|
"loss": 0.9165, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.7451244115669132, |
|
"grad_norm": 4.747946739196777, |
|
"learning_rate": 3.210399512317849e-05, |
|
"loss": 0.5087, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.7478143913920645, |
|
"grad_norm": 3.576078414916992, |
|
"learning_rate": 3.146941087927203e-05, |
|
"loss": 0.5728, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.7505043712172159, |
|
"grad_norm": 2.740264892578125, |
|
"learning_rate": 3.0839987997647935e-05, |
|
"loss": 0.2307, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.7505043712172159, |
|
"eval_loss": 0.7389675974845886, |
|
"eval_runtime": 10.7313, |
|
"eval_samples_per_second": 14.63, |
|
"eval_steps_per_second": 7.362, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.7531943510423672, |
|
"grad_norm": 3.4270105361938477, |
|
"learning_rate": 3.0215773883115706e-05, |
|
"loss": 0.5658, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7558843308675185, |
|
"grad_norm": 3.4167211055755615, |
|
"learning_rate": 2.9596815548187908e-05, |
|
"loss": 0.1781, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.7585743106926698, |
|
"grad_norm": 3.9443657398223877, |
|
"learning_rate": 2.8983159609539635e-05, |
|
"loss": 0.5545, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.7612642905178211, |
|
"grad_norm": 3.164463758468628, |
|
"learning_rate": 2.8374852284497446e-05, |
|
"loss": 0.334, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.7639542703429725, |
|
"grad_norm": 3.6277055740356445, |
|
"learning_rate": 2.7771939387558554e-05, |
|
"loss": 0.411, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7666442501681238, |
|
"grad_norm": 4.296345233917236, |
|
"learning_rate": 2.717446632694025e-05, |
|
"loss": 0.3111, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.769334229993275, |
|
"grad_norm": 4.06040096282959, |
|
"learning_rate": 2.6582478101160167e-05, |
|
"loss": 0.4634, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7720242098184263, |
|
"grad_norm": 4.600436687469482, |
|
"learning_rate": 2.599601929564709e-05, |
|
"loss": 0.6998, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.7747141896435776, |
|
"grad_norm": 3.8486735820770264, |
|
"learning_rate": 2.5415134079383006e-05, |
|
"loss": 0.3987, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.777404169468729, |
|
"grad_norm": 5.362851142883301, |
|
"learning_rate": 2.4839866201576646e-05, |
|
"loss": 0.3466, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7800941492938803, |
|
"grad_norm": 3.8688018321990967, |
|
"learning_rate": 2.4270258988368376e-05, |
|
"loss": 0.2902, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7827841291190316, |
|
"grad_norm": 4.354773044586182, |
|
"learning_rate": 2.3706355339567286e-05, |
|
"loss": 0.4149, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.7854741089441829, |
|
"grad_norm": 7.11607027053833, |
|
"learning_rate": 2.3148197725419983e-05, |
|
"loss": 0.7291, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7881640887693342, |
|
"grad_norm": 5.43526029586792, |
|
"learning_rate": 2.2595828183412172e-05, |
|
"loss": 0.2716, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.7908540685944856, |
|
"grad_norm": 3.004659414291382, |
|
"learning_rate": 2.2049288315102412e-05, |
|
"loss": 0.3067, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7935440484196369, |
|
"grad_norm": 4.5855560302734375, |
|
"learning_rate": 2.1508619282989084e-05, |
|
"loss": 0.2618, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.7962340282447882, |
|
"grad_norm": 4.773977756500244, |
|
"learning_rate": 2.097386180741019e-05, |
|
"loss": 0.5023, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7989240080699395, |
|
"grad_norm": 9.166229248046875, |
|
"learning_rate": 2.0445056163476374e-05, |
|
"loss": 0.4224, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.8016139878950908, |
|
"grad_norm": 6.276297092437744, |
|
"learning_rate": 1.9922242178037864e-05, |
|
"loss": 0.8068, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.8043039677202422, |
|
"grad_norm": 5.523612976074219, |
|
"learning_rate": 1.940545922668472e-05, |
|
"loss": 0.4406, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.8069939475453934, |
|
"grad_norm": 1.5128313302993774, |
|
"learning_rate": 1.88947462307814e-05, |
|
"loss": 0.0216, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8096839273705447, |
|
"grad_norm": 2.8309073448181152, |
|
"learning_rate": 1.8390141654535265e-05, |
|
"loss": 1.299, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.812373907195696, |
|
"grad_norm": 3.6739649772644043, |
|
"learning_rate": 1.789168350209983e-05, |
|
"loss": 1.5798, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.8150638870208473, |
|
"grad_norm": 3.935307741165161, |
|
"learning_rate": 1.739940931471239e-05, |
|
"loss": 1.295, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.8177538668459986, |
|
"grad_norm": 4.4844865798950195, |
|
"learning_rate": 1.6913356167866578e-05, |
|
"loss": 1.225, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.82044384667115, |
|
"grad_norm": 4.518765449523926, |
|
"learning_rate": 1.6433560668520176e-05, |
|
"loss": 1.4111, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.8231338264963013, |
|
"grad_norm": 4.362013339996338, |
|
"learning_rate": 1.5960058952337887e-05, |
|
"loss": 1.1839, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.8258238063214526, |
|
"grad_norm": 4.76102352142334, |
|
"learning_rate": 1.5492886680969963e-05, |
|
"loss": 1.2118, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.8285137861466039, |
|
"grad_norm": 5.4755539894104, |
|
"learning_rate": 1.5032079039366209e-05, |
|
"loss": 1.4798, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.8312037659717552, |
|
"grad_norm": 3.792975902557373, |
|
"learning_rate": 1.4577670733126203e-05, |
|
"loss": 0.7013, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.8338937457969066, |
|
"grad_norm": 5.135954856872559, |
|
"learning_rate": 1.4129695985885228e-05, |
|
"loss": 1.5141, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8365837256220578, |
|
"grad_norm": 3.417525291442871, |
|
"learning_rate": 1.3688188536736968e-05, |
|
"loss": 0.8687, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.8392737054472091, |
|
"grad_norm": 4.7601728439331055, |
|
"learning_rate": 1.3253181637692324e-05, |
|
"loss": 0.9127, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.8419636852723604, |
|
"grad_norm": 4.601919174194336, |
|
"learning_rate": 1.2824708051175016e-05, |
|
"loss": 1.0878, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.8446536650975117, |
|
"grad_norm": 3.320221185684204, |
|
"learning_rate": 1.2402800047554208e-05, |
|
"loss": 0.6061, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.8473436449226631, |
|
"grad_norm": 4.236156463623047, |
|
"learning_rate": 1.1987489402713981e-05, |
|
"loss": 0.7456, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.8500336247478144, |
|
"grad_norm": 6.007240295410156, |
|
"learning_rate": 1.1578807395660207e-05, |
|
"loss": 1.5298, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.8527236045729657, |
|
"grad_norm": 5.775532245635986, |
|
"learning_rate": 1.1176784806164676e-05, |
|
"loss": 0.7343, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.855413584398117, |
|
"grad_norm": 5.709627628326416, |
|
"learning_rate": 1.078145191244706e-05, |
|
"loss": 1.2876, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.8581035642232683, |
|
"grad_norm": 5.935501575469971, |
|
"learning_rate": 1.0392838488894463e-05, |
|
"loss": 0.9374, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.8607935440484197, |
|
"grad_norm": 4.249516010284424, |
|
"learning_rate": 1.0010973803818857e-05, |
|
"loss": 0.5061, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.863483523873571, |
|
"grad_norm": 4.154758453369141, |
|
"learning_rate": 9.635886617252975e-06, |
|
"loss": 0.1188, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.8661735036987223, |
|
"grad_norm": 3.874020576477051, |
|
"learning_rate": 9.267605178784033e-06, |
|
"loss": 0.4923, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.8688634835238735, |
|
"grad_norm": 3.575878143310547, |
|
"learning_rate": 8.906157225426315e-06, |
|
"loss": 0.3217, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.8715534633490248, |
|
"grad_norm": 4.050719261169434, |
|
"learning_rate": 8.55156997953197e-06, |
|
"loss": 0.4612, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8742434431741762, |
|
"grad_norm": 3.588498830795288, |
|
"learning_rate": 8.203870146740932e-06, |
|
"loss": 0.2259, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8769334229993275, |
|
"grad_norm": 5.262954235076904, |
|
"learning_rate": 7.86308391396956e-06, |
|
"loss": 0.7654, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.8796234028244788, |
|
"grad_norm": 5.5735087394714355, |
|
"learning_rate": 7.529236947438256e-06, |
|
"loss": 0.5849, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.8823133826496301, |
|
"grad_norm": 4.838580131530762, |
|
"learning_rate": 7.202354390738608e-06, |
|
"loss": 0.3913, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8850033624747814, |
|
"grad_norm": 5.6935038566589355, |
|
"learning_rate": 6.882460862939522e-06, |
|
"loss": 0.7206, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.8876933422999328, |
|
"grad_norm": 2.3508174419403076, |
|
"learning_rate": 6.5695804567332044e-06, |
|
"loss": 0.1703, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8903833221250841, |
|
"grad_norm": 5.699828624725342, |
|
"learning_rate": 6.263736736620551e-06, |
|
"loss": 0.4676, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.8930733019502354, |
|
"grad_norm": 4.048695087432861, |
|
"learning_rate": 5.964952737136353e-06, |
|
"loss": 0.5628, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8957632817753867, |
|
"grad_norm": 4.811221599578857, |
|
"learning_rate": 5.673250961114529e-06, |
|
"loss": 0.7418, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.898453261600538, |
|
"grad_norm": 3.3414437770843506, |
|
"learning_rate": 5.388653377993324e-06, |
|
"loss": 0.3143, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.9011432414256894, |
|
"grad_norm": 5.924250602722168, |
|
"learning_rate": 5.111181422160671e-06, |
|
"loss": 0.5284, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.9038332212508406, |
|
"grad_norm": 6.767046928405762, |
|
"learning_rate": 4.840855991339799e-06, |
|
"loss": 0.6351, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.9065232010759919, |
|
"grad_norm": 4.555798053741455, |
|
"learning_rate": 4.577697445015472e-06, |
|
"loss": 0.5253, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.9092131809011432, |
|
"grad_norm": 5.7803730964660645, |
|
"learning_rate": 4.321725602900473e-06, |
|
"loss": 0.7582, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.9119031607262945, |
|
"grad_norm": 4.016640663146973, |
|
"learning_rate": 4.072959743443017e-06, |
|
"loss": 0.2845, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.9145931405514459, |
|
"grad_norm": 5.46890926361084, |
|
"learning_rate": 3.83141860237467e-06, |
|
"loss": 0.6128, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9172831203765972, |
|
"grad_norm": 4.543710708618164, |
|
"learning_rate": 3.5971203712993894e-06, |
|
"loss": 0.5227, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.9199731002017485, |
|
"grad_norm": 4.0189008712768555, |
|
"learning_rate": 3.3700826963233735e-06, |
|
"loss": 0.4072, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.9226630800268998, |
|
"grad_norm": 5.0270490646362305, |
|
"learning_rate": 3.1503226767260252e-06, |
|
"loss": 0.5361, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.9253530598520511, |
|
"grad_norm": 7.237580299377441, |
|
"learning_rate": 2.9378568636721835e-06, |
|
"loss": 0.9466, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.9280430396772025, |
|
"grad_norm": 8.795455932617188, |
|
"learning_rate": 2.732701258965531e-06, |
|
"loss": 0.6604, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.9307330195023538, |
|
"grad_norm": 11.6528959274292, |
|
"learning_rate": 2.5348713138434564e-06, |
|
"loss": 0.5807, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.933422999327505, |
|
"grad_norm": 8.07696533203125, |
|
"learning_rate": 2.3443819278132996e-06, |
|
"loss": 0.7975, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.9361129791526563, |
|
"grad_norm": 4.788589954376221, |
|
"learning_rate": 2.161247447530268e-06, |
|
"loss": 0.6227, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.9388029589778076, |
|
"grad_norm": 7.453376293182373, |
|
"learning_rate": 1.985481665716882e-06, |
|
"loss": 0.4651, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.9414929388029589, |
|
"grad_norm": 4.3519392013549805, |
|
"learning_rate": 1.8170978201241474e-06, |
|
"loss": 0.1668, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9441829186281103, |
|
"grad_norm": 3.087855577468872, |
|
"learning_rate": 1.6561085925346332e-06, |
|
"loss": 1.2559, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.9468728984532616, |
|
"grad_norm": 3.9484481811523438, |
|
"learning_rate": 1.5025261078073005e-06, |
|
"loss": 1.0505, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.9495628782784129, |
|
"grad_norm": 4.509681701660156, |
|
"learning_rate": 1.3563619329643119e-06, |
|
"loss": 1.316, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.9522528581035642, |
|
"grad_norm": 4.409306049346924, |
|
"learning_rate": 1.2176270763198828e-06, |
|
"loss": 0.9114, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.9549428379287155, |
|
"grad_norm": 5.652538299560547, |
|
"learning_rate": 1.0863319866512346e-06, |
|
"loss": 1.1458, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.9576328177538669, |
|
"grad_norm": 6.170865535736084, |
|
"learning_rate": 9.624865524115346e-07, |
|
"loss": 1.1232, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.9603227975790182, |
|
"grad_norm": 5.357152938842773, |
|
"learning_rate": 8.461001009852809e-07, |
|
"loss": 0.9592, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.9630127774041695, |
|
"grad_norm": 4.322149753570557, |
|
"learning_rate": 7.371813979857312e-07, |
|
"loss": 0.7773, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.9657027572293208, |
|
"grad_norm": 3.6123275756835938, |
|
"learning_rate": 6.357386465947301e-07, |
|
"loss": 0.5652, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.968392737054472, |
|
"grad_norm": 3.7311031818389893, |
|
"learning_rate": 5.417794869449377e-07, |
|
"loss": 0.6096, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9710827168796234, |
|
"grad_norm": 5.762843608856201, |
|
"learning_rate": 4.5531099554435576e-07, |
|
"loss": 0.9279, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.9737726967047747, |
|
"grad_norm": 4.97388219833374, |
|
"learning_rate": 3.763396847433875e-07, |
|
"loss": 0.5789, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.976462676529926, |
|
"grad_norm": 4.815624713897705, |
|
"learning_rate": 3.048715022443749e-07, |
|
"loss": 0.5138, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.9791526563550773, |
|
"grad_norm": 3.541781425476074, |
|
"learning_rate": 2.409118306536229e-07, |
|
"loss": 0.259, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9818426361802286, |
|
"grad_norm": 2.7444493770599365, |
|
"learning_rate": 1.8446548707604648e-07, |
|
"loss": 0.2707, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.98453261600538, |
|
"grad_norm": 5.796267986297607, |
|
"learning_rate": 1.3553672275230523e-07, |
|
"loss": 0.5347, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.9872225958305313, |
|
"grad_norm": 5.090404987335205, |
|
"learning_rate": 9.412922273871471e-08, |
|
"loss": 0.3201, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.9899125756556826, |
|
"grad_norm": 4.630456924438477, |
|
"learning_rate": 6.024610562962441e-08, |
|
"loss": 0.4391, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9926025554808339, |
|
"grad_norm": 4.325840473175049, |
|
"learning_rate": 3.388992332259422e-08, |
|
"loss": 0.3675, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.9952925353059852, |
|
"grad_norm": 9.686969757080078, |
|
"learning_rate": 1.506266082615948e-08, |
|
"loss": 0.6909, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9979825151311366, |
|
"grad_norm": 4.668429851531982, |
|
"learning_rate": 3.7657361103837776e-09, |
|
"loss": 0.285, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.0013449899125757, |
|
"grad_norm": 4.755204200744629, |
|
"learning_rate": 0.0, |
|
"loss": 0.9972, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.0013449899125757, |
|
"eval_loss": 0.7320420145988464, |
|
"eval_runtime": 10.7106, |
|
"eval_samples_per_second": 14.658, |
|
"eval_steps_per_second": 7.376, |
|
"step": 372 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 372, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 93, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.211789436077998e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|