aleegis12 committed
Commit 1e578ba · verified · 1 Parent(s): 33ef68e

Training in progress, step 300, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c2ef37e66cf698968f2fda51a39cd65be386075e1290cd6d28862b2396af8133
+ oid sha256:c6998651a4f6221c924c5f14c9847c85d7f4ff286da87b492a2c183aab4f8ae3
  size 411094576
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:991167d60948ccbda32d8519d13ed559556631258d027dad0b6ebf3160db254c
- size 209193332
+ oid sha256:3c709930fcd73234c8fd0636ff4ab5c31426d68d32e9fb3a01ebf09b79f50d61
+ size 209193780
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:57857a501037bef88058a756500631b1ca55e504df91ef1fa8582774fac1d6f2
+ oid sha256:5d4c3d45bd5aae62c92c9ff393cc15dabbde2961fe98f16264940444bac07041
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:530505d607699f384741067a5f9139d72f043713adb680898a3f1b5714170c97
+ oid sha256:ebdb14d51e77eb18f9d6184de19bfac710da5493717593749289db85474b6091
  size 1064
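
Each pointer diff above follows the Git LFS spec: the repo tracks only a version line, a sha256 oid, and a byte size, while the actual blob lives in LFS storage. A minimal Python sketch of how one could confirm a downloaded blob against its pointer (the helper names sha256_of and matches_pointer are illustrative, not part of this repo; only the hashlib/os standard library is used):

import hashlib
import os

def sha256_of(path, chunk_size=1 << 20):
    # Stream in 1 MiB chunks so the ~400 MB adapter blob need not fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

def matches_pointer(path, pointer_text):
    # An LFS pointer is "key value" lines: version, oid sha256:<hex>, size <bytes>.
    fields = dict(line.split(" ", 1) for line in pointer_text.strip().splitlines())
    expected_oid = fields["oid"].split(":", 1)[1]
    return sha256_of(path) == expected_oid and os.path.getsize(path) == int(fields["size"])

For example, after this commit adapter_model.safetensors should hash to the new oid (c6998651…) and come to 411094576 bytes.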
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
- "best_metric": 0.9358128905296326,
- "best_model_checkpoint": "miner_id_24/checkpoint-200",
- "epoch": 0.002658681759116786,
+ "best_metric": 0.8697348237037659,
+ "best_model_checkpoint": "miner_id_24/checkpoint-300",
+ "epoch": 0.003988022638675179,
  "eval_steps": 100,
- "global_step": 200,
+ "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1431,6 +1431,714 @@
  "eval_samples_per_second": 19.651,
  "eval_steps_per_second": 4.913,
  "step": 200
+ },
+ {
+ "epoch": 0.00267197516791237,
+ "grad_norm": 1.469771385192871,
+ "learning_rate": 6.883142508466054e-05,
+ "loss": 1.1812,
+ "step": 201
+ },
+ {
+ "epoch": 0.0026852685767079537,
+ "grad_norm": 1.9593838453292847,
+ "learning_rate": 6.852787187549182e-05,
+ "loss": 0.9595,
+ "step": 202
+ },
+ {
+ "epoch": 0.0026985619855035376,
+ "grad_norm": 1.881966471672058,
+ "learning_rate": 6.82235249939575e-05,
+ "loss": 1.0891,
+ "step": 203
+ },
+ {
+ "epoch": 0.0027118553942991216,
+ "grad_norm": 1.5613300800323486,
+ "learning_rate": 6.7918397477265e-05,
+ "loss": 1.1484,
+ "step": 204
+ },
+ {
+ "epoch": 0.0027251488030947055,
+ "grad_norm": 1.6164745092391968,
+ "learning_rate": 6.761250239606169e-05,
+ "loss": 0.8214,
+ "step": 205
+ },
+ {
+ "epoch": 0.0027384422118902894,
+ "grad_norm": 1.4947420358657837,
+ "learning_rate": 6.730585285387465e-05,
+ "loss": 0.9956,
+ "step": 206
+ },
+ {
+ "epoch": 0.0027517356206858733,
+ "grad_norm": 1.6902318000793457,
+ "learning_rate": 6.699846198654971e-05,
+ "loss": 0.8679,
+ "step": 207
+ },
+ {
+ "epoch": 0.0027650290294814572,
+ "grad_norm": 1.6245845556259155,
+ "learning_rate": 6.669034296168855e-05,
+ "loss": 0.9362,
+ "step": 208
+ },
+ {
+ "epoch": 0.002778322438277041,
+ "grad_norm": 1.687902569770813,
+ "learning_rate": 6.638150897808468e-05,
+ "loss": 0.9086,
+ "step": 209
+ },
+ {
+ "epoch": 0.002791615847072625,
+ "grad_norm": 1.6738238334655762,
+ "learning_rate": 6.607197326515808e-05,
+ "loss": 1.0715,
+ "step": 210
+ },
+ {
+ "epoch": 0.002804909255868209,
+ "grad_norm": 1.8981541395187378,
+ "learning_rate": 6.57617490823885e-05,
+ "loss": 1.0964,
+ "step": 211
+ },
+ {
+ "epoch": 0.002818202664663793,
+ "grad_norm": 1.994768500328064,
+ "learning_rate": 6.545084971874738e-05,
+ "loss": 1.0326,
+ "step": 212
+ },
+ {
+ "epoch": 0.002831496073459377,
+ "grad_norm": 1.953397274017334,
+ "learning_rate": 6.513928849212873e-05,
+ "loss": 0.9181,
+ "step": 213
+ },
+ {
+ "epoch": 0.002844789482254961,
+ "grad_norm": 1.5838758945465088,
+ "learning_rate": 6.482707874877854e-05,
+ "loss": 1.0109,
+ "step": 214
+ },
+ {
+ "epoch": 0.002858082891050545,
+ "grad_norm": 2.204489231109619,
+ "learning_rate": 6.451423386272312e-05,
+ "loss": 0.9583,
+ "step": 215
+ },
+ {
+ "epoch": 0.002871376299846129,
+ "grad_norm": 1.679416537284851,
+ "learning_rate": 6.420076723519614e-05,
+ "loss": 1.0152,
+ "step": 216
+ },
+ {
+ "epoch": 0.002884669708641713,
+ "grad_norm": 2.062034845352173,
+ "learning_rate": 6.388669229406462e-05,
+ "loss": 0.9058,
+ "step": 217
+ },
+ {
+ "epoch": 0.002897963117437297,
+ "grad_norm": 1.9006659984588623,
+ "learning_rate": 6.357202249325371e-05,
+ "loss": 0.8065,
+ "step": 218
+ },
+ {
+ "epoch": 0.0029112565262328807,
+ "grad_norm": 1.9091770648956299,
+ "learning_rate": 6.32567713121704e-05,
+ "loss": 0.8315,
+ "step": 219
+ },
+ {
+ "epoch": 0.0029245499350284646,
+ "grad_norm": 2.309170722961426,
+ "learning_rate": 6.294095225512603e-05,
+ "loss": 0.9633,
+ "step": 220
+ },
+ {
+ "epoch": 0.0029378433438240485,
+ "grad_norm": 2.036591053009033,
+ "learning_rate": 6.26245788507579e-05,
+ "loss": 0.9114,
+ "step": 221
+ },
+ {
+ "epoch": 0.0029511367526196325,
+ "grad_norm": 2.540407419204712,
+ "learning_rate": 6.230766465144967e-05,
+ "loss": 0.8928,
+ "step": 222
+ },
+ {
+ "epoch": 0.0029644301614152164,
+ "grad_norm": 2.447540044784546,
+ "learning_rate": 6.199022323275083e-05,
+ "loss": 0.8958,
+ "step": 223
+ },
+ {
+ "epoch": 0.0029777235702108003,
+ "grad_norm": 2.1421115398406982,
+ "learning_rate": 6.167226819279528e-05,
+ "loss": 0.9212,
+ "step": 224
+ },
+ {
+ "epoch": 0.002991016979006384,
+ "grad_norm": 2.2318172454833984,
+ "learning_rate": 6.135381315171867e-05,
+ "loss": 0.7849,
+ "step": 225
+ },
+ {
+ "epoch": 0.003004310387801968,
+ "grad_norm": 2.107386350631714,
+ "learning_rate": 6.103487175107507e-05,
+ "loss": 0.8173,
+ "step": 226
+ },
+ {
+ "epoch": 0.003017603796597552,
+ "grad_norm": 2.08105206489563,
+ "learning_rate": 6.071545765325254e-05,
+ "loss": 0.9439,
+ "step": 227
+ },
+ {
+ "epoch": 0.003030897205393136,
+ "grad_norm": 2.376014471054077,
+ "learning_rate": 6.0395584540887963e-05,
+ "loss": 0.9148,
+ "step": 228
+ },
+ {
+ "epoch": 0.00304419061418872,
+ "grad_norm": 2.1427810192108154,
+ "learning_rate": 6.007526611628086e-05,
+ "loss": 0.8044,
+ "step": 229
+ },
+ {
+ "epoch": 0.0030574840229843038,
+ "grad_norm": 2.6007277965545654,
+ "learning_rate": 5.9754516100806423e-05,
+ "loss": 0.8667,
+ "step": 230
+ },
+ {
+ "epoch": 0.0030707774317798877,
+ "grad_norm": 2.6047568321228027,
+ "learning_rate": 5.9433348234327765e-05,
+ "loss": 0.8761,
+ "step": 231
+ },
+ {
+ "epoch": 0.0030840708405754716,
+ "grad_norm": 2.6537301540374756,
+ "learning_rate": 5.911177627460739e-05,
+ "loss": 1.0008,
+ "step": 232
+ },
+ {
+ "epoch": 0.0030973642493710555,
+ "grad_norm": 2.5320510864257812,
+ "learning_rate": 5.8789813996717736e-05,
+ "loss": 0.9914,
+ "step": 233
+ },
+ {
+ "epoch": 0.0031106576581666394,
+ "grad_norm": 2.1913628578186035,
+ "learning_rate": 5.8467475192451226e-05,
+ "loss": 0.8323,
+ "step": 234
+ },
+ {
+ "epoch": 0.0031239510669622233,
+ "grad_norm": 2.716449737548828,
+ "learning_rate": 5.814477366972945e-05,
+ "loss": 0.885,
+ "step": 235
+ },
+ {
+ "epoch": 0.0031372444757578073,
+ "grad_norm": 3.720883846282959,
+ "learning_rate": 5.782172325201155e-05,
+ "loss": 1.0299,
+ "step": 236
+ },
+ {
+ "epoch": 0.003150537884553391,
+ "grad_norm": 2.6181509494781494,
+ "learning_rate": 5.749833777770225e-05,
+ "loss": 0.8591,
+ "step": 237
+ },
+ {
+ "epoch": 0.003163831293348975,
+ "grad_norm": 2.382664918899536,
+ "learning_rate": 5.717463109955896e-05,
+ "loss": 0.8851,
+ "step": 238
+ },
+ {
+ "epoch": 0.003177124702144559,
+ "grad_norm": 2.3294265270233154,
+ "learning_rate": 5.685061708409841e-05,
+ "loss": 0.7961,
+ "step": 239
+ },
+ {
+ "epoch": 0.003190418110940143,
+ "grad_norm": 3.5375571250915527,
+ "learning_rate": 5.6526309611002594e-05,
+ "loss": 0.9115,
+ "step": 240
+ },
+ {
+ "epoch": 0.003203711519735727,
+ "grad_norm": 2.9233639240264893,
+ "learning_rate": 5.6201722572524275e-05,
+ "loss": 0.7504,
+ "step": 241
+ },
+ {
+ "epoch": 0.003217004928531311,
+ "grad_norm": 3.2264790534973145,
+ "learning_rate": 5.587686987289189e-05,
+ "loss": 0.7414,
+ "step": 242
+ },
+ {
+ "epoch": 0.003230298337326895,
+ "grad_norm": 2.921107530593872,
+ "learning_rate": 5.5551765427713884e-05,
+ "loss": 0.5761,
+ "step": 243
+ },
+ {
+ "epoch": 0.003243591746122479,
+ "grad_norm": 3.3718862533569336,
+ "learning_rate": 5.522642316338268e-05,
+ "loss": 0.8256,
+ "step": 244
+ },
+ {
+ "epoch": 0.003256885154918063,
+ "grad_norm": 2.911158800125122,
+ "learning_rate": 5.490085701647805e-05,
+ "loss": 0.7299,
+ "step": 245
+ },
+ {
+ "epoch": 0.003270178563713647,
+ "grad_norm": 3.8199920654296875,
+ "learning_rate": 5.457508093317013e-05,
+ "loss": 0.7782,
+ "step": 246
+ },
+ {
+ "epoch": 0.0032834719725092307,
+ "grad_norm": 3.6611592769622803,
+ "learning_rate": 5.4249108868622086e-05,
+ "loss": 0.7346,
+ "step": 247
+ },
+ {
+ "epoch": 0.0032967653813048147,
+ "grad_norm": 3.3983826637268066,
+ "learning_rate": 5.392295478639225e-05,
+ "loss": 0.68,
+ "step": 248
+ },
+ {
+ "epoch": 0.0033100587901003986,
+ "grad_norm": 4.487311840057373,
+ "learning_rate": 5.359663265783598e-05,
+ "loss": 0.7677,
+ "step": 249
+ },
+ {
+ "epoch": 0.0033233521988959825,
+ "grad_norm": 5.948139667510986,
+ "learning_rate": 5.327015646150716e-05,
+ "loss": 0.7702,
+ "step": 250
+ },
+ {
+ "epoch": 0.0033366456076915664,
+ "grad_norm": 1.1044285297393799,
+ "learning_rate": 5.294354018255945e-05,
+ "loss": 0.9482,
+ "step": 251
+ },
+ {
+ "epoch": 0.0033499390164871503,
+ "grad_norm": 1.6621804237365723,
+ "learning_rate": 5.26167978121472e-05,
+ "loss": 0.8978,
+ "step": 252
+ },
+ {
+ "epoch": 0.0033632324252827342,
+ "grad_norm": 1.7893524169921875,
+ "learning_rate": 5.228994334682604e-05,
+ "loss": 0.9667,
+ "step": 253
+ },
+ {
+ "epoch": 0.003376525834078318,
+ "grad_norm": 1.6742503643035889,
+ "learning_rate": 5.196299078795344e-05,
+ "loss": 0.8508,
+ "step": 254
+ },
+ {
+ "epoch": 0.003389819242873902,
+ "grad_norm": 1.6336678266525269,
+ "learning_rate": 5.1635954141088813e-05,
+ "loss": 1.0183,
+ "step": 255
+ },
+ {
+ "epoch": 0.003403112651669486,
+ "grad_norm": 1.4634300470352173,
+ "learning_rate": 5.1308847415393666e-05,
+ "loss": 0.8325,
+ "step": 256
+ },
+ {
+ "epoch": 0.00341640606046507,
+ "grad_norm": 1.8752628564834595,
+ "learning_rate": 5.0981684623031415e-05,
+ "loss": 0.8682,
+ "step": 257
+ },
+ {
+ "epoch": 0.003429699469260654,
+ "grad_norm": 1.516921043395996,
+ "learning_rate": 5.0654479778567223e-05,
+ "loss": 0.87,
+ "step": 258
+ },
+ {
+ "epoch": 0.0034429928780562377,
+ "grad_norm": 1.953859567642212,
+ "learning_rate": 5.0327246898367597e-05,
+ "loss": 1.0406,
+ "step": 259
+ },
+ {
+ "epoch": 0.0034562862868518216,
+ "grad_norm": 1.692104697227478,
+ "learning_rate": 5e-05,
+ "loss": 1.0228,
+ "step": 260
+ },
+ {
+ "epoch": 0.0034695796956474055,
+ "grad_norm": 1.6310532093048096,
+ "learning_rate": 4.9672753101632415e-05,
+ "loss": 1.0342,
+ "step": 261
+ },
+ {
+ "epoch": 0.0034828731044429895,
+ "grad_norm": 1.8367486000061035,
+ "learning_rate": 4.934552022143279e-05,
+ "loss": 0.8968,
+ "step": 262
+ },
+ {
+ "epoch": 0.0034961665132385734,
+ "grad_norm": 1.6751683950424194,
+ "learning_rate": 4.901831537696859e-05,
+ "loss": 0.9189,
+ "step": 263
+ },
+ {
+ "epoch": 0.0035094599220341573,
+ "grad_norm": 1.8059558868408203,
+ "learning_rate": 4.869115258460635e-05,
+ "loss": 0.9239,
+ "step": 264
+ },
+ {
+ "epoch": 0.003522753330829741,
+ "grad_norm": 1.7002429962158203,
+ "learning_rate": 4.83640458589112e-05,
+ "loss": 0.8102,
+ "step": 265
+ },
+ {
+ "epoch": 0.003536046739625325,
+ "grad_norm": 1.738466501235962,
+ "learning_rate": 4.8037009212046586e-05,
+ "loss": 1.0022,
+ "step": 266
+ },
+ {
+ "epoch": 0.003549340148420909,
+ "grad_norm": 2.1397879123687744,
+ "learning_rate": 4.7710056653173976e-05,
+ "loss": 0.8882,
+ "step": 267
+ },
+ {
+ "epoch": 0.003562633557216493,
+ "grad_norm": 2.1561543941497803,
+ "learning_rate": 4.738320218785281e-05,
+ "loss": 0.9589,
+ "step": 268
+ },
+ {
+ "epoch": 0.003575926966012077,
+ "grad_norm": 2.116396427154541,
+ "learning_rate": 4.7056459817440544e-05,
+ "loss": 0.8827,
+ "step": 269
+ },
+ {
+ "epoch": 0.003589220374807661,
+ "grad_norm": 2.172356367111206,
+ "learning_rate": 4.6729843538492847e-05,
+ "loss": 0.9613,
+ "step": 270
+ },
+ {
+ "epoch": 0.003602513783603245,
+ "grad_norm": 1.7960728406906128,
+ "learning_rate": 4.640336734216403e-05,
+ "loss": 0.8781,
+ "step": 271
+ },
+ {
+ "epoch": 0.003615807192398829,
+ "grad_norm": 1.8511698246002197,
+ "learning_rate": 4.607704521360776e-05,
+ "loss": 0.8647,
+ "step": 272
+ },
+ {
+ "epoch": 0.003629100601194413,
+ "grad_norm": 2.0229597091674805,
+ "learning_rate": 4.575089113137792e-05,
+ "loss": 1.004,
+ "step": 273
+ },
+ {
+ "epoch": 0.003642394009989997,
+ "grad_norm": 2.3480939865112305,
+ "learning_rate": 4.542491906682989e-05,
+ "loss": 1.0119,
+ "step": 274
+ },
+ {
+ "epoch": 0.0036556874187855808,
+ "grad_norm": 1.9140769243240356,
+ "learning_rate": 4.509914298352197e-05,
+ "loss": 0.8089,
+ "step": 275
+ },
+ {
+ "epoch": 0.0036689808275811647,
+ "grad_norm": 1.9026778936386108,
+ "learning_rate": 4.477357683661734e-05,
+ "loss": 0.7569,
+ "step": 276
+ },
+ {
+ "epoch": 0.0036822742363767486,
+ "grad_norm": 2.2026915550231934,
+ "learning_rate": 4.444823457228612e-05,
+ "loss": 0.8855,
+ "step": 277
+ },
+ {
+ "epoch": 0.0036955676451723325,
+ "grad_norm": 2.2045536041259766,
+ "learning_rate": 4.412313012710813e-05,
+ "loss": 0.8306,
+ "step": 278
+ },
+ {
+ "epoch": 0.0037088610539679164,
+ "grad_norm": 2.3194525241851807,
+ "learning_rate": 4.379827742747575e-05,
+ "loss": 0.8154,
+ "step": 279
+ },
+ {
+ "epoch": 0.0037221544627635004,
+ "grad_norm": 2.6129322052001953,
+ "learning_rate": 4.347369038899744e-05,
+ "loss": 0.9187,
+ "step": 280
+ },
+ {
+ "epoch": 0.0037354478715590843,
+ "grad_norm": 2.2725565433502197,
+ "learning_rate": 4.3149382915901606e-05,
+ "loss": 0.793,
+ "step": 281
+ },
+ {
+ "epoch": 0.003748741280354668,
+ "grad_norm": 2.23557186126709,
+ "learning_rate": 4.282536890044104e-05,
+ "loss": 0.8578,
+ "step": 282
+ },
+ {
+ "epoch": 0.003762034689150252,
+ "grad_norm": 2.394350290298462,
+ "learning_rate": 4.250166222229774e-05,
+ "loss": 0.8555,
+ "step": 283
+ },
+ {
+ "epoch": 0.003775328097945836,
+ "grad_norm": 2.935340642929077,
+ "learning_rate": 4.2178276747988446e-05,
+ "loss": 0.852,
+ "step": 284
+ },
+ {
+ "epoch": 0.00378862150674142,
+ "grad_norm": 3.061005115509033,
+ "learning_rate": 4.185522633027057e-05,
+ "loss": 0.8782,
+ "step": 285
+ },
+ {
+ "epoch": 0.003801914915537004,
+ "grad_norm": 2.7278201580047607,
+ "learning_rate": 4.153252480754877e-05,
+ "loss": 0.8451,
+ "step": 286
+ },
+ {
+ "epoch": 0.0038152083243325878,
+ "grad_norm": 2.643934488296509,
+ "learning_rate": 4.1210186003282275e-05,
+ "loss": 0.7744,
+ "step": 287
+ },
+ {
+ "epoch": 0.0038285017331281717,
+ "grad_norm": 3.218041181564331,
+ "learning_rate": 4.088822372539263e-05,
+ "loss": 0.8261,
+ "step": 288
+ },
+ {
+ "epoch": 0.0038417951419237556,
+ "grad_norm": 2.650805950164795,
+ "learning_rate": 4.0566651765672246e-05,
+ "loss": 0.8542,
+ "step": 289
+ },
+ {
+ "epoch": 0.0038550885507193395,
+ "grad_norm": 2.679455041885376,
+ "learning_rate": 4.0245483899193595e-05,
+ "loss": 0.8374,
+ "step": 290
+ },
+ {
+ "epoch": 0.0038683819595149234,
+ "grad_norm": 3.0015034675598145,
+ "learning_rate": 3.992473388371915e-05,
+ "loss": 0.9829,
+ "step": 291
+ },
+ {
+ "epoch": 0.0038816753683105073,
+ "grad_norm": 3.549671173095703,
+ "learning_rate": 3.960441545911204e-05,
+ "loss": 0.8356,
+ "step": 292
+ },
+ {
+ "epoch": 0.0038949687771060912,
+ "grad_norm": 3.1017885208129883,
+ "learning_rate": 3.928454234674747e-05,
+ "loss": 0.8398,
+ "step": 293
+ },
+ {
+ "epoch": 0.003908262185901675,
+ "grad_norm": 3.0585556030273438,
+ "learning_rate": 3.896512824892495e-05,
+ "loss": 0.7647,
+ "step": 294
+ },
+ {
+ "epoch": 0.0039215555946972595,
+ "grad_norm": 3.385533571243286,
+ "learning_rate": 3.864618684828134e-05,
+ "loss": 0.6782,
+ "step": 295
+ },
+ {
+ "epoch": 0.003934849003492843,
+ "grad_norm": 5.169503211975098,
+ "learning_rate": 3.832773180720475e-05,
+ "loss": 0.7389,
+ "step": 296
+ },
+ {
+ "epoch": 0.003948142412288427,
+ "grad_norm": 2.9151158332824707,
+ "learning_rate": 3.800977676724919e-05,
+ "loss": 0.6878,
+ "step": 297
+ },
+ {
+ "epoch": 0.003961435821084011,
+ "grad_norm": 3.42378568649292,
+ "learning_rate": 3.769233534855035e-05,
+ "loss": 0.6473,
+ "step": 298
+ },
+ {
+ "epoch": 0.003974729229879595,
+ "grad_norm": 4.071504592895508,
+ "learning_rate": 3.73754211492421e-05,
+ "loss": 0.8104,
+ "step": 299
+ },
+ {
+ "epoch": 0.003988022638675179,
+ "grad_norm": 5.344113349914551,
+ "learning_rate": 3.705904774487396e-05,
+ "loss": 0.9578,
+ "step": 300
+ },
+ {
+ "epoch": 0.003988022638675179,
+ "eval_loss": 0.8697348237037659,
+ "eval_runtime": 6447.6628,
+ "eval_samples_per_second": 19.65,
+ "eval_steps_per_second": 4.912,
+ "step": 300
  }
  ],
  "logging_steps": 1,
 
@@ -1459,7 +2167,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.2542533349828198e+17,
+ "total_flos": 1.880597072801956e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null