jssky committed
Commit 3292db3 · verified · 1 Parent(s): 8d00a24

Training in progress, step 319, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:fafa966272a0edeea5852168eebf98f2f28b4f21262d9c8c2914aa58b84518a9
+ oid sha256:01a2408b511d3e72357df79b7a07ad06ad9230d6c19b84c21feaf195f605743f
 size 237546776
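
Each blob in this commit is stored as a Git LFS pointer: the diff replaces only the `oid sha256:` line (the SHA-256 digest of the file contents) while the `size` here stays 237546776 bytes. A minimal sketch for verifying a downloaded blob against its pointer; the local path is hypothetical and only the standard library is used:

```python
import hashlib

def parse_lfs_pointer(text: str) -> dict:
    """Parse a Git LFS pointer file (version / oid / size lines) into a dict."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "sha256": fields["oid"].removeprefix("sha256:"),
        "size": int(fields["size"]),
    }

def verify_lfs_object(pointer_text: str, blob_path: str) -> bool:
    """Check that a downloaded blob matches the oid and size in its pointer."""
    ptr = parse_lfs_pointer(pointer_text)
    digest = hashlib.sha256()
    n_bytes = 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
            digest.update(chunk)
            n_bytes += len(chunk)
    return digest.hexdigest() == ptr["sha256"] and n_bytes == ptr["size"]

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:01a2408b511d3e72357df79b7a07ad06ad9230d6c19b84c21feaf195f605743f
size 237546776
"""
# The path is an assumption: point it at wherever the blob was downloaded.
print(verify_lfs_object(pointer, "last-checkpoint/adapter_model.safetensors"))
```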
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:96fed01e3c67de8f8a17086f8c311754b44d682a7921ef641569d0361e6f2391
- size 223237336
+ oid sha256:9468afcac283b917fc2c10feb94175b8ad9038c8b7ba16949e23ec5bb2939333
+ size 223237720
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:212bb1137818292df860ec1c39a828962452eddcc2b94a4e9e878f38aa7f6ae3
+ oid sha256:37e80c0bb7385d41535be59c31232d72d36d5990a511e813fe84cb14b3359947
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:6cf0ce84b55ef92ba1a34321398985dec341ef213dc7495da15b9effc346b4bc
+ oid sha256:4222df64063a0da99a3d9b170b238ff21de6b814178c3cb5d1a90ab8274aa2a3
 size 1064
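
Alongside the adapter weights, the three `.pt`/`.pth` blobs above hold the optimizer state, the learning-rate scheduler state, and the RNG state, which is what lets a run resume exactly at step 319. A sketch for inspecting them locally with PyTorch, assuming the checkpoint was downloaded into `last-checkpoint/`:

```python
import torch

# weights_only=False is needed because these files contain full state dicts,
# not just tensors; this assumes you trust the checkpoint's source.
optimizer_state = torch.load("last-checkpoint/optimizer.pt",
                             map_location="cpu", weights_only=False)
scheduler_state = torch.load("last-checkpoint/scheduler.pt",
                             map_location="cpu", weights_only=False)
rng_state = torch.load("last-checkpoint/rng_state.pth",
                       map_location="cpu", weights_only=False)

print(type(optimizer_state), type(scheduler_state), type(rng_state))
# In a transformers Trainer run these files are consumed automatically via
# trainer.train(resume_from_checkpoint="last-checkpoint").
```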
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
 "best_metric": null,
 "best_model_checkpoint": null,
- "epoch": 0.7541241162608012,
+ "epoch": 1.002356637863315,
 "eval_steps": 80,
- "global_step": 240,
+ "global_step": 319,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,
@@ -1711,6 +1711,559 @@
 "eval_samples_per_second": 43.102,
 "eval_steps_per_second": 21.551,
 "step": 240
+ },
+ {
+ "epoch": 0.7572663000785546,
+ "grad_norm": 4.266700744628906,
+ "learning_rate": 2.98305571716907e-05,
+ "loss": 1.1874,
+ "step": 241
+ },
+ {
+ "epoch": 0.7604084838963079,
+ "grad_norm": 5.862624168395996,
+ "learning_rate": 2.9109822211964043e-05,
+ "loss": 1.0646,
+ "step": 242
+ },
+ {
+ "epoch": 0.7635506677140613,
+ "grad_norm": 4.90443754196167,
+ "learning_rate": 2.8396414908880098e-05,
+ "loss": 1.3927,
+ "step": 243
+ },
+ {
+ "epoch": 0.7666928515318147,
+ "grad_norm": 6.397622585296631,
+ "learning_rate": 2.769040900472488e-05,
+ "loss": 1.2935,
+ "step": 244
+ },
+ {
+ "epoch": 0.769835035349568,
+ "grad_norm": 5.6756591796875,
+ "learning_rate": 2.699187747672899e-05,
+ "loss": 1.2046,
+ "step": 245
+ },
+ {
+ "epoch": 0.7729772191673213,
+ "grad_norm": 5.524738788604736,
+ "learning_rate": 2.630089252952427e-05,
+ "loss": 1.0745,
+ "step": 246
+ },
+ {
+ "epoch": 0.7761194029850746,
+ "grad_norm": 5.8002119064331055,
+ "learning_rate": 2.5617525587680402e-05,
+ "loss": 1.1491,
+ "step": 247
+ },
+ {
+ "epoch": 0.779261586802828,
+ "grad_norm": 6.8815484046936035,
+ "learning_rate": 2.4941847288321797e-05,
+ "loss": 1.5067,
+ "step": 248
+ },
+ {
+ "epoch": 0.7824037706205813,
+ "grad_norm": 7.29502010345459,
+ "learning_rate": 2.427392747382623e-05,
+ "loss": 1.5713,
+ "step": 249
+ },
+ {
+ "epoch": 0.7855459544383346,
+ "grad_norm": 8.774285316467285,
+ "learning_rate": 2.3613835184605525e-05,
+ "loss": 1.701,
+ "step": 250
+ },
+ {
+ "epoch": 0.788688138256088,
+ "grad_norm": 2.379939556121826,
+ "learning_rate": 2.2961638651968975e-05,
+ "loss": 1.0794,
+ "step": 251
+ },
+ {
+ "epoch": 0.7918303220738413,
+ "grad_norm": 3.104327440261841,
+ "learning_rate": 2.231740529107057e-05,
+ "loss": 1.4833,
+ "step": 252
+ },
+ {
+ "epoch": 0.7949725058915946,
+ "grad_norm": 2.5866000652313232,
+ "learning_rate": 2.1681201693940668e-05,
+ "loss": 1.2774,
+ "step": 253
+ },
+ {
+ "epoch": 0.798114689709348,
+ "grad_norm": 2.862161874771118,
+ "learning_rate": 2.1053093622602404e-05,
+ "loss": 1.2606,
+ "step": 254
+ },
+ {
+ "epoch": 0.8012568735271013,
+ "grad_norm": 3.9734158515930176,
+ "learning_rate": 2.043314600227425e-05,
+ "loss": 1.4184,
+ "step": 255
+ },
+ {
+ "epoch": 0.8043990573448547,
+ "grad_norm": 2.7739901542663574,
+ "learning_rate": 1.982142291465896e-05,
+ "loss": 1.0968,
+ "step": 256
+ },
+ {
+ "epoch": 0.8075412411626081,
+ "grad_norm": 2.9346413612365723,
+ "learning_rate": 1.921798759131953e-05,
+ "loss": 1.4055,
+ "step": 257
+ },
+ {
+ "epoch": 0.8106834249803614,
+ "grad_norm": 3.191915273666382,
+ "learning_rate": 1.8622902407143394e-05,
+ "loss": 1.2101,
+ "step": 258
+ },
+ {
+ "epoch": 0.8138256087981147,
+ "grad_norm": 3.06551456451416,
+ "learning_rate": 1.8036228873894746e-05,
+ "loss": 1.1478,
+ "step": 259
+ },
+ {
+ "epoch": 0.816967792615868,
+ "grad_norm": 3.3214335441589355,
+ "learning_rate": 1.7458027633856478e-05,
+ "loss": 1.2364,
+ "step": 260
+ },
+ {
+ "epoch": 0.8201099764336214,
+ "grad_norm": 3.1710729598999023,
+ "learning_rate": 1.6888358453561648e-05,
+ "loss": 1.0582,
+ "step": 261
+ },
+ {
+ "epoch": 0.8232521602513747,
+ "grad_norm": 3.025092363357544,
+ "learning_rate": 1.6327280217615792e-05,
+ "loss": 1.2433,
+ "step": 262
+ },
+ {
+ "epoch": 0.826394344069128,
+ "grad_norm": 3.2119741439819336,
+ "learning_rate": 1.577485092261012e-05,
+ "loss": 1.2365,
+ "step": 263
+ },
+ {
+ "epoch": 0.8295365278868814,
+ "grad_norm": 3.112516403198242,
+ "learning_rate": 1.5231127671126677e-05,
+ "loss": 1.0492,
+ "step": 264
+ },
+ {
+ "epoch": 0.8326787117046347,
+ "grad_norm": 2.9350504875183105,
+ "learning_rate": 1.4696166665835853e-05,
+ "loss": 1.1789,
+ "step": 265
+ },
+ {
+ "epoch": 0.835820895522388,
+ "grad_norm": 3.688014030456543,
+ "learning_rate": 1.4170023203686878e-05,
+ "loss": 1.3643,
+ "step": 266
+ },
+ {
+ "epoch": 0.8389630793401414,
+ "grad_norm": 3.518484115600586,
+ "learning_rate": 1.3652751670192077e-05,
+ "loss": 1.2082,
+ "step": 267
+ },
+ {
+ "epoch": 0.8421052631578947,
+ "grad_norm": 3.1499722003936768,
+ "learning_rate": 1.3144405533805138e-05,
+ "loss": 0.8437,
+ "step": 268
+ },
+ {
+ "epoch": 0.845247446975648,
+ "grad_norm": 3.3172833919525146,
+ "learning_rate": 1.2645037340394284e-05,
+ "loss": 1.1557,
+ "step": 269
+ },
+ {
+ "epoch": 0.8483896307934015,
+ "grad_norm": 3.544373035430908,
+ "learning_rate": 1.215469870781093e-05,
+ "loss": 1.1555,
+ "step": 270
+ },
+ {
+ "epoch": 0.8515318146111548,
+ "grad_norm": 3.739354133605957,
+ "learning_rate": 1.167344032055394e-05,
+ "loss": 1.3633,
+ "step": 271
+ },
+ {
+ "epoch": 0.8546739984289081,
+ "grad_norm": 3.5768179893493652,
+ "learning_rate": 1.120131192453069e-05,
+ "loss": 1.0143,
+ "step": 272
+ },
+ {
+ "epoch": 0.8578161822466615,
+ "grad_norm": 3.85493540763855,
+ "learning_rate": 1.0738362321914997e-05,
+ "loss": 1.3486,
+ "step": 273
+ },
+ {
+ "epoch": 0.8609583660644148,
+ "grad_norm": 3.8633315563201904,
+ "learning_rate": 1.02846393661026e-05,
+ "loss": 1.2283,
+ "step": 274
+ },
+ {
+ "epoch": 0.8641005498821681,
+ "grad_norm": 3.5946390628814697,
+ "learning_rate": 9.840189956764677e-06,
+ "loss": 1.2316,
+ "step": 275
+ },
+ {
+ "epoch": 0.8672427336999214,
+ "grad_norm": 4.018091201782227,
+ "learning_rate": 9.405060035000135e-06,
+ "loss": 1.0729,
+ "step": 276
+ },
+ {
+ "epoch": 0.8703849175176748,
+ "grad_norm": 3.750617265701294,
+ "learning_rate": 8.979294578586738e-06,
+ "loss": 0.983,
+ "step": 277
+ },
+ {
+ "epoch": 0.8735271013354281,
+ "grad_norm": 4.0061936378479,
+ "learning_rate": 8.562937597331899e-06,
+ "loss": 1.4571,
+ "step": 278
+ },
+ {
+ "epoch": 0.8766692851531814,
+ "grad_norm": 4.811827182769775,
+ "learning_rate": 8.156032128523694e-06,
+ "loss": 1.5216,
+ "step": 279
+ },
+ {
+ "epoch": 0.8798114689709348,
+ "grad_norm": 3.8524036407470703,
+ "learning_rate": 7.758620232482084e-06,
+ "loss": 1.1154,
+ "step": 280
+ },
+ {
+ "epoch": 0.8829536527886881,
+ "grad_norm": 4.497751712799072,
+ "learning_rate": 7.370742988211365e-06,
+ "loss": 1.3045,
+ "step": 281
+ },
+ {
+ "epoch": 0.8860958366064414,
+ "grad_norm": 3.6167821884155273,
+ "learning_rate": 6.992440489154051e-06,
+ "loss": 1.2766,
+ "step": 282
+ },
+ {
+ "epoch": 0.8892380204241949,
+ "grad_norm": 4.365938663482666,
+ "learning_rate": 6.623751839046455e-06,
+ "loss": 1.1343,
+ "step": 283
+ },
+ {
+ "epoch": 0.8923802042419482,
+ "grad_norm": 3.732172966003418,
+ "learning_rate": 6.264715147876743e-06,
+ "loss": 1.0509,
+ "step": 284
+ },
+ {
+ "epoch": 0.8955223880597015,
+ "grad_norm": 5.462162494659424,
+ "learning_rate": 5.915367527945615e-06,
+ "loss": 1.4873,
+ "step": 285
+ },
+ {
+ "epoch": 0.8986645718774549,
+ "grad_norm": 3.5439860820770264,
+ "learning_rate": 5.575745090030138e-06,
+ "loss": 0.9738,
+ "step": 286
+ },
+ {
+ "epoch": 0.9018067556952082,
+ "grad_norm": 5.106057167053223,
+ "learning_rate": 5.245882939651181e-06,
+ "loss": 1.6314,
+ "step": 287
+ },
+ {
+ "epoch": 0.9049489395129615,
+ "grad_norm": 4.478841781616211,
+ "learning_rate": 4.92581517344457e-06,
+ "loss": 1.3549,
+ "step": 288
+ },
+ {
+ "epoch": 0.9080911233307148,
+ "grad_norm": 4.549522876739502,
+ "learning_rate": 4.61557487563673e-06,
+ "loss": 1.1305,
+ "step": 289
+ },
+ {
+ "epoch": 0.9112333071484682,
+ "grad_norm": 5.698983669281006,
+ "learning_rate": 4.315194114624888e-06,
+ "loss": 1.2778,
+ "step": 290
+ },
+ {
+ "epoch": 0.9143754909662215,
+ "grad_norm": 5.114129066467285,
+ "learning_rate": 4.0247039396622e-06,
+ "loss": 1.364,
+ "step": 291
+ },
+ {
+ "epoch": 0.9175176747839748,
+ "grad_norm": 5.043893337249756,
+ "learning_rate": 3.7441343776484117e-06,
+ "loss": 1.6245,
+ "step": 292
+ },
+ {
+ "epoch": 0.9206598586017282,
+ "grad_norm": 6.31735897064209,
+ "learning_rate": 3.473514430026026e-06,
+ "loss": 1.0683,
+ "step": 293
+ },
+ {
+ "epoch": 0.9238020424194815,
+ "grad_norm": 5.165215015411377,
+ "learning_rate": 3.212872069782513e-06,
+ "loss": 1.6659,
+ "step": 294
+ },
+ {
+ "epoch": 0.9269442262372348,
+ "grad_norm": 4.504268169403076,
+ "learning_rate": 2.9622342385589254e-06,
+ "loss": 1.322,
+ "step": 295
+ },
+ {
+ "epoch": 0.9300864100549883,
+ "grad_norm": 5.05787467956543,
+ "learning_rate": 2.7216268438649773e-06,
+ "loss": 1.0772,
+ "step": 296
+ },
+ {
+ "epoch": 0.9332285938727416,
+ "grad_norm": 6.266513347625732,
+ "learning_rate": 2.4910747564010685e-06,
+ "loss": 1.1698,
+ "step": 297
+ },
+ {
+ "epoch": 0.9363707776904949,
+ "grad_norm": 5.642542362213135,
+ "learning_rate": 2.2706018074875045e-06,
+ "loss": 1.2399,
+ "step": 298
+ },
+ {
+ "epoch": 0.9395129615082483,
+ "grad_norm": 6.315053462982178,
+ "learning_rate": 2.060230786601225e-06,
+ "loss": 1.4454,
+ "step": 299
+ },
+ {
+ "epoch": 0.9426551453260016,
+ "grad_norm": 7.884403705596924,
+ "learning_rate": 1.8599834390199855e-06,
+ "loss": 1.3084,
+ "step": 300
+ },
+ {
+ "epoch": 0.9457973291437549,
+ "grad_norm": 2.967442512512207,
+ "learning_rate": 1.6698804635747579e-06,
+ "loss": 1.3336,
+ "step": 301
+ },
+ {
+ "epoch": 0.9489395129615082,
+ "grad_norm": 3.1417999267578125,
+ "learning_rate": 1.4899415105101067e-06,
+ "loss": 1.3268,
+ "step": 302
+ },
+ {
+ "epoch": 0.9520816967792616,
+ "grad_norm": 2.946615695953369,
+ "learning_rate": 1.3201851794530373e-06,
+ "loss": 1.4632,
+ "step": 303
+ },
+ {
+ "epoch": 0.9552238805970149,
+ "grad_norm": 2.756666660308838,
+ "learning_rate": 1.160629017490389e-06,
+ "loss": 0.9778,
+ "step": 304
+ },
+ {
+ "epoch": 0.9583660644147682,
+ "grad_norm": 3.684561252593994,
+ "learning_rate": 1.0112895173551185e-06,
+ "loss": 1.5126,
+ "step": 305
+ },
+ {
+ "epoch": 0.9615082482325216,
+ "grad_norm": 3.060070514678955,
+ "learning_rate": 8.721821157214316e-07,
+ "loss": 1.1442,
+ "step": 306
+ },
+ {
+ "epoch": 0.9646504320502749,
+ "grad_norm": 3.5914061069488525,
+ "learning_rate": 7.433211916092142e-07,
+ "loss": 0.918,
+ "step": 307
+ },
+ {
+ "epoch": 0.9677926158680282,
+ "grad_norm": 3.4093003273010254,
+ "learning_rate": 6.247200648976991e-07,
+ "loss": 1.2567,
+ "step": 308
+ },
+ {
+ "epoch": 0.9709347996857817,
+ "grad_norm": 4.0583600997924805,
+ "learning_rate": 5.163909949486234e-07,
+ "loss": 1.2847,
+ "step": 309
+ },
+ {
+ "epoch": 0.974076983503535,
+ "grad_norm": 3.779723644256592,
+ "learning_rate": 4.1834517933907467e-07,
+ "loss": 1.0873,
+ "step": 310
+ },
+ {
+ "epoch": 0.9772191673212883,
+ "grad_norm": 3.453876256942749,
+ "learning_rate": 3.3059275270396207e-07,
+ "loss": 1.092,
+ "step": 311
+ },
+ {
+ "epoch": 0.9803613511390417,
+ "grad_norm": 3.9864912033081055,
+ "learning_rate": 2.5314278568850935e-07,
+ "loss": 1.1474,
+ "step": 312
+ },
+ {
+ "epoch": 0.983503534956795,
+ "grad_norm": 3.627704381942749,
+ "learning_rate": 1.8600328401061629e-07,
+ "loss": 0.9844,
+ "step": 313
+ },
+ {
+ "epoch": 0.9866457187745483,
+ "grad_norm": 3.843008041381836,
+ "learning_rate": 1.2918118763335373e-07,
+ "loss": 0.9021,
+ "step": 314
+ },
+ {
+ "epoch": 0.9897879025923016,
+ "grad_norm": 5.430519104003906,
+ "learning_rate": 8.268237004757096e-08,
+ "loss": 1.1688,
+ "step": 315
+ },
+ {
+ "epoch": 0.992930086410055,
+ "grad_norm": 5.529233932495117,
+ "learning_rate": 4.651163766484779e-08,
+ "loss": 1.309,
+ "step": 316
+ },
+ {
+ "epoch": 0.9960722702278083,
+ "grad_norm": 6.51206636428833,
+ "learning_rate": 2.0672729320581065e-08,
+ "loss": 2.0086,
+ "step": 317
+ },
+ {
+ "epoch": 0.9992144540455616,
+ "grad_norm": 5.897537708282471,
+ "learning_rate": 5.1683158875937e-09,
+ "loss": 0.881,
+ "step": 318
+ },
+ {
+ "epoch": 1.002356637863315,
+ "grad_norm": 25.450618743896484,
+ "learning_rate": 0.0,
+ "loss": 3.048,
+ "step": 319
 }
 ],
 "logging_steps": 1,
@@ -1725,12 +2278,12 @@
 "should_evaluate": false,
 "should_log": false,
 "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
 },
 "attributes": {}
 }
 },
- "total_flos": 2.029490427396096e+16,
+ "total_flos": 2.696474000149709e+16,
 "train_batch_size": 2,
 "trial_name": null,
 "trial_params": null
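
The trainer_state.json changes mark the end of the run: epoch crosses 1.0, global_step reaches 319, should_training_stop flips to true, and the final logged step shows a loss and grad_norm spike (3.048 / 25.45) as the learning rate hits 0.0. A small sketch for reading the state back locally; log_history is the key transformers uses for these per-step entries, and the path assumes the checkpoint was downloaded to last-checkpoint/:

```python
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

print(state["epoch"], state["global_step"])  # 1.002356637863315 319

# Print the last few per-step log entries from the diff above.
for entry in state["log_history"][-3:]:
    print(entry.get("step"), entry.get("loss"), entry.get("grad_norm"))
```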