Romain-XV commited on
Commit
8203432
·
verified ·
1 Parent(s): dd4063b

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f5ab6b7d01e357faec8d24f5eeab288da927aeb0b4ae82502046b4bd35b750a
3
  size 161533192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2d83d5f901eae11145cacd89c72231a010785bff7439a1ba5f4a5e94b8052b4
3
  size 161533192
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3beddf3d49e219d1affc71de13c23d27956429a4cabfa1b9a37828e9f9664d0
3
- size 82460660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1daa0c2c790a56b21b4c6c41ad6a3c0d997b0d94c85b459b9bd582601d994cca
3
+ size 82461044
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:204817fe3f4bf35aa4941a46f59c51c1b43bbff8a2da0f4838279ae11c91a700
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89f940dbe392540bac79082367808c07a597a4f60dff698b2c1ee1c27e1aabed
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83ebe7e36e83c1b5914c2a6daae1c4f326cee5b8b90231fe486f937787ce3706
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd8039035e9f22cc9e9230f66e4f1f1db2add3119ff8edfe450ef63eb16e5439
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.7198740243911743,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-200",
4
- "epoch": 0.07969516599008791,
5
  "eval_steps": 100,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1431,6 +1431,714 @@
1431
  "eval_samples_per_second": 14.017,
1432
  "eval_steps_per_second": 3.505,
1433
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1434
  }
1435
  ],
1436
  "logging_steps": 1,
@@ -1459,7 +2167,7 @@
1459
  "attributes": {}
1460
  }
1461
  },
1462
- "total_flos": 5.592309546614784e+17,
1463
  "train_batch_size": 4,
1464
  "trial_name": null,
1465
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.700640082359314,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-300",
4
+ "epoch": 0.11954274898513187,
5
  "eval_steps": 100,
6
+ "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1431
  "eval_samples_per_second": 14.017,
1432
  "eval_steps_per_second": 3.505,
1433
  "step": 200
1434
+ },
1435
+ {
1436
+ "epoch": 0.08009364182003835,
1437
+ "grad_norm": 0.5120438933372498,
1438
+ "learning_rate": 0.00011964121814957137,
1439
+ "loss": 1.7626,
1440
+ "step": 201
1441
+ },
1442
+ {
1443
+ "epoch": 0.08049211764998879,
1444
+ "grad_norm": 0.5081213712692261,
1445
+ "learning_rate": 0.00011893581990651848,
1446
+ "loss": 1.7664,
1447
+ "step": 202
1448
+ },
1449
+ {
1450
+ "epoch": 0.08089059347993924,
1451
+ "grad_norm": 0.45303136110305786,
1452
+ "learning_rate": 0.00011822944303213486,
1453
+ "loss": 1.4845,
1454
+ "step": 203
1455
+ },
1456
+ {
1457
+ "epoch": 0.08128906930988967,
1458
+ "grad_norm": 0.4521328806877136,
1459
+ "learning_rate": 0.00011752212403302784,
1460
+ "loss": 1.4534,
1461
+ "step": 204
1462
+ },
1463
+ {
1464
+ "epoch": 0.08168754513984011,
1465
+ "grad_norm": 0.4870966970920563,
1466
+ "learning_rate": 0.00011681389946449504,
1467
+ "loss": 1.568,
1468
+ "step": 205
1469
+ },
1470
+ {
1471
+ "epoch": 0.08208602096979055,
1472
+ "grad_norm": 0.5694287419319153,
1473
+ "learning_rate": 0.00011610480592863531,
1474
+ "loss": 1.9597,
1475
+ "step": 206
1476
+ },
1477
+ {
1478
+ "epoch": 0.08248449679974099,
1479
+ "grad_norm": 0.5484179258346558,
1480
+ "learning_rate": 0.00011539488007245702,
1481
+ "loss": 1.8557,
1482
+ "step": 207
1483
+ },
1484
+ {
1485
+ "epoch": 0.08288297262969144,
1486
+ "grad_norm": 0.44166100025177,
1487
+ "learning_rate": 0.00011468415858598411,
1488
+ "loss": 1.5371,
1489
+ "step": 208
1490
+ },
1491
+ {
1492
+ "epoch": 0.08328144845964187,
1493
+ "grad_norm": 0.5365661978721619,
1494
+ "learning_rate": 0.00011397267820035986,
1495
+ "loss": 1.7778,
1496
+ "step": 209
1497
+ },
1498
+ {
1499
+ "epoch": 0.08367992428959231,
1500
+ "grad_norm": 0.45911017060279846,
1501
+ "learning_rate": 0.00011326047568594851,
1502
+ "loss": 1.5729,
1503
+ "step": 210
1504
+ },
1505
+ {
1506
+ "epoch": 0.08407840011954275,
1507
+ "grad_norm": 0.5417599678039551,
1508
+ "learning_rate": 0.00011254758785043515,
1509
+ "loss": 1.8296,
1510
+ "step": 211
1511
+ },
1512
+ {
1513
+ "epoch": 0.08447687594949319,
1514
+ "grad_norm": 0.48594942688941956,
1515
+ "learning_rate": 0.0001118340515369232,
1516
+ "loss": 1.7837,
1517
+ "step": 212
1518
+ },
1519
+ {
1520
+ "epoch": 0.08487535177944362,
1521
+ "grad_norm": 0.4888298511505127,
1522
+ "learning_rate": 0.00011111990362203033,
1523
+ "loss": 1.6575,
1524
+ "step": 213
1525
+ },
1526
+ {
1527
+ "epoch": 0.08527382760939407,
1528
+ "grad_norm": 0.5313907265663147,
1529
+ "learning_rate": 0.00011040518101398276,
1530
+ "loss": 1.7803,
1531
+ "step": 214
1532
+ },
1533
+ {
1534
+ "epoch": 0.08567230343934451,
1535
+ "grad_norm": 0.5065906643867493,
1536
+ "learning_rate": 0.00010968992065070769,
1537
+ "loss": 1.6539,
1538
+ "step": 215
1539
+ },
1540
+ {
1541
+ "epoch": 0.08607077926929495,
1542
+ "grad_norm": 0.46294957399368286,
1543
+ "learning_rate": 0.00010897415949792427,
1544
+ "loss": 1.6412,
1545
+ "step": 216
1546
+ },
1547
+ {
1548
+ "epoch": 0.08646925509924538,
1549
+ "grad_norm": 0.5068647861480713,
1550
+ "learning_rate": 0.00010825793454723325,
1551
+ "loss": 1.7173,
1552
+ "step": 217
1553
+ },
1554
+ {
1555
+ "epoch": 0.08686773092919582,
1556
+ "grad_norm": 0.45219966769218445,
1557
+ "learning_rate": 0.0001075412828142051,
1558
+ "loss": 1.4531,
1559
+ "step": 218
1560
+ },
1561
+ {
1562
+ "epoch": 0.08726620675914626,
1563
+ "grad_norm": 0.48035022616386414,
1564
+ "learning_rate": 0.0001068242413364671,
1565
+ "loss": 1.6187,
1566
+ "step": 219
1567
+ },
1568
+ {
1569
+ "epoch": 0.08766468258909671,
1570
+ "grad_norm": 0.5463985204696655,
1571
+ "learning_rate": 0.00010610684717178905,
1572
+ "loss": 1.749,
1573
+ "step": 220
1574
+ },
1575
+ {
1576
+ "epoch": 0.08806315841904715,
1577
+ "grad_norm": 0.4818764626979828,
1578
+ "learning_rate": 0.00010538913739616816,
1579
+ "loss": 1.4508,
1580
+ "step": 221
1581
+ },
1582
+ {
1583
+ "epoch": 0.08846163424899758,
1584
+ "grad_norm": 0.5018213987350464,
1585
+ "learning_rate": 0.00010467114910191289,
1586
+ "loss": 1.6853,
1587
+ "step": 222
1588
+ },
1589
+ {
1590
+ "epoch": 0.08886011007894802,
1591
+ "grad_norm": 0.5122075080871582,
1592
+ "learning_rate": 0.00010395291939572593,
1593
+ "loss": 1.6991,
1594
+ "step": 223
1595
+ },
1596
+ {
1597
+ "epoch": 0.08925858590889846,
1598
+ "grad_norm": 0.48191148042678833,
1599
+ "learning_rate": 0.00010323448539678653,
1600
+ "loss": 1.6428,
1601
+ "step": 224
1602
+ },
1603
+ {
1604
+ "epoch": 0.08965706173884891,
1605
+ "grad_norm": 0.4748276472091675,
1606
+ "learning_rate": 0.00010251588423483205,
1607
+ "loss": 1.7059,
1608
+ "step": 225
1609
+ },
1610
+ {
1611
+ "epoch": 0.09005553756879935,
1612
+ "grad_norm": 0.5150067806243896,
1613
+ "learning_rate": 0.0001017971530482392,
1614
+ "loss": 1.7409,
1615
+ "step": 226
1616
+ },
1617
+ {
1618
+ "epoch": 0.09045401339874978,
1619
+ "grad_norm": 0.5893855094909668,
1620
+ "learning_rate": 0.00010107832898210439,
1621
+ "loss": 1.7183,
1622
+ "step": 227
1623
+ },
1624
+ {
1625
+ "epoch": 0.09085248922870022,
1626
+ "grad_norm": 0.5195055603981018,
1627
+ "learning_rate": 0.00010035944918632429,
1628
+ "loss": 1.8396,
1629
+ "step": 228
1630
+ },
1631
+ {
1632
+ "epoch": 0.09125096505865066,
1633
+ "grad_norm": 0.5996953845024109,
1634
+ "learning_rate": 9.96405508136757e-05,
1635
+ "loss": 1.9944,
1636
+ "step": 229
1637
+ },
1638
+ {
1639
+ "epoch": 0.0916494408886011,
1640
+ "grad_norm": 0.5057780146598816,
1641
+ "learning_rate": 9.892167101789564e-05,
1642
+ "loss": 1.6473,
1643
+ "step": 230
1644
+ },
1645
+ {
1646
+ "epoch": 0.09204791671855155,
1647
+ "grad_norm": 0.46774283051490784,
1648
+ "learning_rate": 9.820284695176082e-05,
1649
+ "loss": 1.5973,
1650
+ "step": 231
1651
+ },
1652
+ {
1653
+ "epoch": 0.09244639254850198,
1654
+ "grad_norm": 0.46982142329216003,
1655
+ "learning_rate": 9.748411576516794e-05,
1656
+ "loss": 1.6464,
1657
+ "step": 232
1658
+ },
1659
+ {
1660
+ "epoch": 0.09284486837845242,
1661
+ "grad_norm": 0.4873621165752411,
1662
+ "learning_rate": 9.676551460321349e-05,
1663
+ "loss": 1.6629,
1664
+ "step": 233
1665
+ },
1666
+ {
1667
+ "epoch": 0.09324334420840286,
1668
+ "grad_norm": 0.4866909682750702,
1669
+ "learning_rate": 9.60470806042741e-05,
1670
+ "loss": 1.6262,
1671
+ "step": 234
1672
+ },
1673
+ {
1674
+ "epoch": 0.0936418200383533,
1675
+ "grad_norm": 0.5320809483528137,
1676
+ "learning_rate": 9.532885089808713e-05,
1677
+ "loss": 1.7158,
1678
+ "step": 235
1679
+ },
1680
+ {
1681
+ "epoch": 0.09404029586830374,
1682
+ "grad_norm": 0.47346270084381104,
1683
+ "learning_rate": 9.461086260383187e-05,
1684
+ "loss": 1.6044,
1685
+ "step": 236
1686
+ },
1687
+ {
1688
+ "epoch": 0.09443877169825418,
1689
+ "grad_norm": 0.5696609616279602,
1690
+ "learning_rate": 9.389315282821097e-05,
1691
+ "loss": 1.7883,
1692
+ "step": 237
1693
+ },
1694
+ {
1695
+ "epoch": 0.09483724752820462,
1696
+ "grad_norm": 0.4926949441432953,
1697
+ "learning_rate": 9.317575866353292e-05,
1698
+ "loss": 1.7306,
1699
+ "step": 238
1700
+ },
1701
+ {
1702
+ "epoch": 0.09523572335815506,
1703
+ "grad_norm": 0.5241943001747131,
1704
+ "learning_rate": 9.245871718579491e-05,
1705
+ "loss": 1.732,
1706
+ "step": 239
1707
+ },
1708
+ {
1709
+ "epoch": 0.09563419918810549,
1710
+ "grad_norm": 0.5425236225128174,
1711
+ "learning_rate": 9.174206545276677e-05,
1712
+ "loss": 1.6209,
1713
+ "step": 240
1714
+ },
1715
+ {
1716
+ "epoch": 0.09603267501805593,
1717
+ "grad_norm": 0.5216458439826965,
1718
+ "learning_rate": 9.102584050207578e-05,
1719
+ "loss": 1.74,
1720
+ "step": 241
1721
+ },
1722
+ {
1723
+ "epoch": 0.09643115084800638,
1724
+ "grad_norm": 0.5082316994667053,
1725
+ "learning_rate": 9.031007934929236e-05,
1726
+ "loss": 1.6836,
1727
+ "step": 242
1728
+ },
1729
+ {
1730
+ "epoch": 0.09682962667795682,
1731
+ "grad_norm": 0.48965132236480713,
1732
+ "learning_rate": 8.959481898601728e-05,
1733
+ "loss": 1.7055,
1734
+ "step": 243
1735
+ },
1736
+ {
1737
+ "epoch": 0.09722810250790725,
1738
+ "grad_norm": 0.514946699142456,
1739
+ "learning_rate": 8.888009637796968e-05,
1740
+ "loss": 1.684,
1741
+ "step": 244
1742
+ },
1743
+ {
1744
+ "epoch": 0.09762657833785769,
1745
+ "grad_norm": 0.551802396774292,
1746
+ "learning_rate": 8.81659484630768e-05,
1747
+ "loss": 1.8566,
1748
+ "step": 245
1749
+ },
1750
+ {
1751
+ "epoch": 0.09802505416780813,
1752
+ "grad_norm": 0.4790934920310974,
1753
+ "learning_rate": 8.745241214956483e-05,
1754
+ "loss": 1.6461,
1755
+ "step": 246
1756
+ },
1757
+ {
1758
+ "epoch": 0.09842352999775858,
1759
+ "grad_norm": 0.5450412631034851,
1760
+ "learning_rate": 8.673952431405148e-05,
1761
+ "loss": 1.7215,
1762
+ "step": 247
1763
+ },
1764
+ {
1765
+ "epoch": 0.09882200582770902,
1766
+ "grad_norm": 0.5299497842788696,
1767
+ "learning_rate": 8.602732179964017e-05,
1768
+ "loss": 1.7454,
1769
+ "step": 248
1770
+ },
1771
+ {
1772
+ "epoch": 0.09922048165765945,
1773
+ "grad_norm": 0.5010784268379211,
1774
+ "learning_rate": 8.531584141401591e-05,
1775
+ "loss": 1.6028,
1776
+ "step": 249
1777
+ },
1778
+ {
1779
+ "epoch": 0.09961895748760989,
1780
+ "grad_norm": 0.4926188886165619,
1781
+ "learning_rate": 8.4605119927543e-05,
1782
+ "loss": 1.6837,
1783
+ "step": 250
1784
+ },
1785
+ {
1786
+ "epoch": 0.10001743331756033,
1787
+ "grad_norm": 0.5703017115592957,
1788
+ "learning_rate": 8.38951940713647e-05,
1789
+ "loss": 1.8639,
1790
+ "step": 251
1791
+ },
1792
+ {
1793
+ "epoch": 0.10041590914751077,
1794
+ "grad_norm": 0.5429261326789856,
1795
+ "learning_rate": 8.318610053550497e-05,
1796
+ "loss": 1.7258,
1797
+ "step": 252
1798
+ },
1799
+ {
1800
+ "epoch": 0.10081438497746122,
1801
+ "grad_norm": 0.48338782787323,
1802
+ "learning_rate": 8.247787596697218e-05,
1803
+ "loss": 1.5873,
1804
+ "step": 253
1805
+ },
1806
+ {
1807
+ "epoch": 0.10121286080741165,
1808
+ "grad_norm": 0.506877601146698,
1809
+ "learning_rate": 8.177055696786516e-05,
1810
+ "loss": 1.6736,
1811
+ "step": 254
1812
+ },
1813
+ {
1814
+ "epoch": 0.10161133663736209,
1815
+ "grad_norm": 0.537820041179657,
1816
+ "learning_rate": 8.106418009348157e-05,
1817
+ "loss": 1.9075,
1818
+ "step": 255
1819
+ },
1820
+ {
1821
+ "epoch": 0.10200981246731253,
1822
+ "grad_norm": 0.4729152023792267,
1823
+ "learning_rate": 8.035878185042868e-05,
1824
+ "loss": 1.5359,
1825
+ "step": 256
1826
+ },
1827
+ {
1828
+ "epoch": 0.10240828829726296,
1829
+ "grad_norm": 0.4413747191429138,
1830
+ "learning_rate": 7.965439869473664e-05,
1831
+ "loss": 1.6245,
1832
+ "step": 257
1833
+ },
1834
+ {
1835
+ "epoch": 0.10280676412721342,
1836
+ "grad_norm": 0.5398510694503784,
1837
+ "learning_rate": 7.895106702997437e-05,
1838
+ "loss": 1.6318,
1839
+ "step": 258
1840
+ },
1841
+ {
1842
+ "epoch": 0.10320523995716385,
1843
+ "grad_norm": 0.5172785520553589,
1844
+ "learning_rate": 7.824882320536814e-05,
1845
+ "loss": 1.6601,
1846
+ "step": 259
1847
+ },
1848
+ {
1849
+ "epoch": 0.10360371578711429,
1850
+ "grad_norm": 0.4824993908405304,
1851
+ "learning_rate": 7.754770351392311e-05,
1852
+ "loss": 1.5672,
1853
+ "step": 260
1854
+ },
1855
+ {
1856
+ "epoch": 0.10400219161706473,
1857
+ "grad_norm": 0.4745709300041199,
1858
+ "learning_rate": 7.684774419054747e-05,
1859
+ "loss": 1.7128,
1860
+ "step": 261
1861
+ },
1862
+ {
1863
+ "epoch": 0.10440066744701516,
1864
+ "grad_norm": 0.5071855783462524,
1865
+ "learning_rate": 7.614898141017996e-05,
1866
+ "loss": 1.7368,
1867
+ "step": 262
1868
+ },
1869
+ {
1870
+ "epoch": 0.1047991432769656,
1871
+ "grad_norm": 0.5377690196037292,
1872
+ "learning_rate": 7.54514512859201e-05,
1873
+ "loss": 1.8659,
1874
+ "step": 263
1875
+ },
1876
+ {
1877
+ "epoch": 0.10519761910691605,
1878
+ "grad_norm": 0.4762866199016571,
1879
+ "learning_rate": 7.475518986716194e-05,
1880
+ "loss": 1.6012,
1881
+ "step": 264
1882
+ },
1883
+ {
1884
+ "epoch": 0.10559609493686649,
1885
+ "grad_norm": 0.46296924352645874,
1886
+ "learning_rate": 7.406023313773097e-05,
1887
+ "loss": 1.5484,
1888
+ "step": 265
1889
+ },
1890
+ {
1891
+ "epoch": 0.10599457076681693,
1892
+ "grad_norm": 0.47845426201820374,
1893
+ "learning_rate": 7.336661701402439e-05,
1894
+ "loss": 1.6248,
1895
+ "step": 266
1896
+ },
1897
+ {
1898
+ "epoch": 0.10639304659676736,
1899
+ "grad_norm": 0.48351001739501953,
1900
+ "learning_rate": 7.267437734315492e-05,
1901
+ "loss": 1.5549,
1902
+ "step": 267
1903
+ },
1904
+ {
1905
+ "epoch": 0.1067915224267178,
1906
+ "grad_norm": 0.48554375767707825,
1907
+ "learning_rate": 7.198354990109805e-05,
1908
+ "loss": 1.5708,
1909
+ "step": 268
1910
+ },
1911
+ {
1912
+ "epoch": 0.10718999825666825,
1913
+ "grad_norm": 0.47755077481269836,
1914
+ "learning_rate": 7.129417039084333e-05,
1915
+ "loss": 1.5864,
1916
+ "step": 269
1917
+ },
1918
+ {
1919
+ "epoch": 0.10758847408661869,
1920
+ "grad_norm": 0.4970269799232483,
1921
+ "learning_rate": 7.060627444054893e-05,
1922
+ "loss": 1.6373,
1923
+ "step": 270
1924
+ },
1925
+ {
1926
+ "epoch": 0.10798694991656912,
1927
+ "grad_norm": 0.47547978162765503,
1928
+ "learning_rate": 6.99198976017005e-05,
1929
+ "loss": 1.7433,
1930
+ "step": 271
1931
+ },
1932
+ {
1933
+ "epoch": 0.10838542574651956,
1934
+ "grad_norm": 0.5408848524093628,
1935
+ "learning_rate": 6.923507534727373e-05,
1936
+ "loss": 1.77,
1937
+ "step": 272
1938
+ },
1939
+ {
1940
+ "epoch": 0.10878390157647,
1941
+ "grad_norm": 0.49777430295944214,
1942
+ "learning_rate": 6.855184306990106e-05,
1943
+ "loss": 1.6071,
1944
+ "step": 273
1945
+ },
1946
+ {
1947
+ "epoch": 0.10918237740642044,
1948
+ "grad_norm": 0.4691534638404846,
1949
+ "learning_rate": 6.78702360800425e-05,
1950
+ "loss": 1.5913,
1951
+ "step": 274
1952
+ },
1953
+ {
1954
+ "epoch": 0.10958085323637089,
1955
+ "grad_norm": 0.5284269452095032,
1956
+ "learning_rate": 6.719028960416098e-05,
1957
+ "loss": 1.8038,
1958
+ "step": 275
1959
+ },
1960
+ {
1961
+ "epoch": 0.10997932906632132,
1962
+ "grad_norm": 0.49061042070388794,
1963
+ "learning_rate": 6.651203878290139e-05,
1964
+ "loss": 1.5991,
1965
+ "step": 276
1966
+ },
1967
+ {
1968
+ "epoch": 0.11037780489627176,
1969
+ "grad_norm": 0.5676330327987671,
1970
+ "learning_rate": 6.583551866927475e-05,
1971
+ "loss": 1.8924,
1972
+ "step": 277
1973
+ },
1974
+ {
1975
+ "epoch": 0.1107762807262222,
1976
+ "grad_norm": 0.5392544865608215,
1977
+ "learning_rate": 6.516076422684654e-05,
1978
+ "loss": 1.7611,
1979
+ "step": 278
1980
+ },
1981
+ {
1982
+ "epoch": 0.11117475655617264,
1983
+ "grad_norm": 0.5719506740570068,
1984
+ "learning_rate": 6.448781032792972e-05,
1985
+ "loss": 1.756,
1986
+ "step": 279
1987
+ },
1988
+ {
1989
+ "epoch": 0.11157323238612307,
1990
+ "grad_norm": 0.4809233248233795,
1991
+ "learning_rate": 6.381669175178248e-05,
1992
+ "loss": 1.641,
1993
+ "step": 280
1994
+ },
1995
+ {
1996
+ "epoch": 0.11197170821607352,
1997
+ "grad_norm": 0.48434188961982727,
1998
+ "learning_rate": 6.31474431828108e-05,
1999
+ "loss": 1.579,
2000
+ "step": 281
2001
+ },
2002
+ {
2003
+ "epoch": 0.11237018404602396,
2004
+ "grad_norm": 0.5024405717849731,
2005
+ "learning_rate": 6.248009920877592e-05,
2006
+ "loss": 1.6653,
2007
+ "step": 282
2008
+ },
2009
+ {
2010
+ "epoch": 0.1127686598759744,
2011
+ "grad_norm": 0.441279798746109,
2012
+ "learning_rate": 6.181469431900672e-05,
2013
+ "loss": 1.5105,
2014
+ "step": 283
2015
+ },
2016
+ {
2017
+ "epoch": 0.11316713570592483,
2018
+ "grad_norm": 0.5233234763145447,
2019
+ "learning_rate": 6.115126290261745e-05,
2020
+ "loss": 1.7695,
2021
+ "step": 284
2022
+ },
2023
+ {
2024
+ "epoch": 0.11356561153587527,
2025
+ "grad_norm": 0.5281261801719666,
2026
+ "learning_rate": 6.048983924673022e-05,
2027
+ "loss": 1.76,
2028
+ "step": 285
2029
+ },
2030
+ {
2031
+ "epoch": 0.11396408736582572,
2032
+ "grad_norm": 0.534590482711792,
2033
+ "learning_rate": 5.983045753470308e-05,
2034
+ "loss": 1.8155,
2035
+ "step": 286
2036
+ },
2037
+ {
2038
+ "epoch": 0.11436256319577616,
2039
+ "grad_norm": 0.5247072577476501,
2040
+ "learning_rate": 5.917315184436345e-05,
2041
+ "loss": 1.6073,
2042
+ "step": 287
2043
+ },
2044
+ {
2045
+ "epoch": 0.1147610390257266,
2046
+ "grad_norm": 0.4829355776309967,
2047
+ "learning_rate": 5.851795614624682e-05,
2048
+ "loss": 1.5224,
2049
+ "step": 288
2050
+ },
2051
+ {
2052
+ "epoch": 0.11515951485567703,
2053
+ "grad_norm": 0.516015887260437,
2054
+ "learning_rate": 5.786490430184115e-05,
2055
+ "loss": 1.6813,
2056
+ "step": 289
2057
+ },
2058
+ {
2059
+ "epoch": 0.11555799068562747,
2060
+ "grad_norm": 0.48894891142845154,
2061
+ "learning_rate": 5.72140300618369e-05,
2062
+ "loss": 1.7965,
2063
+ "step": 290
2064
+ },
2065
+ {
2066
+ "epoch": 0.11595646651557791,
2067
+ "grad_norm": 0.49149996042251587,
2068
+ "learning_rate": 5.656536706438267e-05,
2069
+ "loss": 1.6388,
2070
+ "step": 291
2071
+ },
2072
+ {
2073
+ "epoch": 0.11635494234552836,
2074
+ "grad_norm": 0.4835774898529053,
2075
+ "learning_rate": 5.591894883334667e-05,
2076
+ "loss": 1.6856,
2077
+ "step": 292
2078
+ },
2079
+ {
2080
+ "epoch": 0.1167534181754788,
2081
+ "grad_norm": 0.5278857946395874,
2082
+ "learning_rate": 5.5274808776584367e-05,
2083
+ "loss": 1.6883,
2084
+ "step": 293
2085
+ },
2086
+ {
2087
+ "epoch": 0.11715189400542923,
2088
+ "grad_norm": 0.4995588958263397,
2089
+ "learning_rate": 5.463298018421171e-05,
2090
+ "loss": 1.519,
2091
+ "step": 294
2092
+ },
2093
+ {
2094
+ "epoch": 0.11755036983537967,
2095
+ "grad_norm": 0.5236543416976929,
2096
+ "learning_rate": 5.399349622688479e-05,
2097
+ "loss": 1.7372,
2098
+ "step": 295
2099
+ },
2100
+ {
2101
+ "epoch": 0.11794884566533011,
2102
+ "grad_norm": 0.45699524879455566,
2103
+ "learning_rate": 5.335638995408545e-05,
2104
+ "loss": 1.6082,
2105
+ "step": 296
2106
+ },
2107
+ {
2108
+ "epoch": 0.11834732149528056,
2109
+ "grad_norm": 0.5191316604614258,
2110
+ "learning_rate": 5.272169429241325e-05,
2111
+ "loss": 1.7123,
2112
+ "step": 297
2113
+ },
2114
+ {
2115
+ "epoch": 0.118745797325231,
2116
+ "grad_norm": 0.42880895733833313,
2117
+ "learning_rate": 5.208944204388377e-05,
2118
+ "loss": 1.4809,
2119
+ "step": 298
2120
+ },
2121
+ {
2122
+ "epoch": 0.11914427315518143,
2123
+ "grad_norm": 0.5574065446853638,
2124
+ "learning_rate": 5.145966588423341e-05,
2125
+ "loss": 1.8128,
2126
+ "step": 299
2127
+ },
2128
+ {
2129
+ "epoch": 0.11954274898513187,
2130
+ "grad_norm": 0.47847244143486023,
2131
+ "learning_rate": 5.0832398361230596e-05,
2132
+ "loss": 1.5699,
2133
+ "step": 300
2134
+ },
2135
+ {
2136
+ "epoch": 0.11954274898513187,
2137
+ "eval_loss": 1.700640082359314,
2138
+ "eval_runtime": 603.0633,
2139
+ "eval_samples_per_second": 14.018,
2140
+ "eval_steps_per_second": 3.505,
2141
+ "step": 300
2142
  }
2143
  ],
2144
  "logging_steps": 1,
 
2167
  "attributes": {}
2168
  }
2169
  },
2170
+ "total_flos": 8.388464319922176e+17,
2171
  "train_batch_size": 4,
2172
  "trial_name": null,
2173
  "trial_params": null