whiteapple8222 commited on
Commit
8ca59fc
·
verified ·
1 Parent(s): 83f97e9

Training in progress, step 700, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:271a1bc981038a2e9a2be89f59b086109d0acdc161e99c620f8d27a304cd854a
3
  size 556856304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80abaf385de52296916718e681935c6768a0a32c4d5289ee386f17e1664b163f
3
  size 556856304
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f3d999bd034df420c2d4748e53b52965d2c7e75c692a0b66bf826ca3f10ee1c8
3
  size 21599316
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:580ac5ff6f77821425e037ad3c74c0bfd112859b028a2db428feb4839651d2af
3
  size 21599316
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bfdf4d2c7d06a45608d3aefb664ef7c0b1e7c1768d6d28419ae2acb17d46d34a
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:486b6099b25fd8543d7c61a0c8411551172858931459e611b5edfdb1e356f30a
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:344239db3127022ab4555da8a93e03326ad98987159e0d995028c1aed64245ed
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7650a95ad72c1e724156e425d1136ef779a743435f328ed6c165002f9e288809
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.012315504272532637,
5
  "eval_steps": 50,
6
- "global_step": 650,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4669,6 +4669,364 @@
4669
  "eval_samples_per_second": 40.017,
4670
  "eval_steps_per_second": 20.009,
4671
  "step": 650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4672
  }
4673
  ],
4674
  "logging_steps": 1,
@@ -4688,7 +5046,7 @@
4688
  "attributes": {}
4689
  }
4690
  },
4691
- "total_flos": 6.93432181850112e+16,
4692
  "train_batch_size": 2,
4693
  "trial_name": null,
4694
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.013262850755035147,
5
  "eval_steps": 50,
6
+ "global_step": 700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4669
  "eval_samples_per_second": 40.017,
4670
  "eval_steps_per_second": 20.009,
4671
  "step": 650
4672
+ },
4673
+ {
4674
+ "epoch": 0.012334451202182687,
4675
+ "grad_norm": 0.13303719460964203,
4676
+ "learning_rate": 0.0001999999994930954,
4677
+ "loss": 0.0354,
4678
+ "step": 651
4679
+ },
4680
+ {
4681
+ "epoch": 0.012353398131832737,
4682
+ "grad_norm": 0.15215599536895752,
4683
+ "learning_rate": 0.00019999999949151252,
4684
+ "loss": 0.0312,
4685
+ "step": 652
4686
+ },
4687
+ {
4688
+ "epoch": 0.012372345061482787,
4689
+ "grad_norm": 0.12846222519874573,
4690
+ "learning_rate": 0.00019999999948992723,
4691
+ "loss": 0.0323,
4692
+ "step": 653
4693
+ },
4694
+ {
4695
+ "epoch": 0.012391291991132838,
4696
+ "grad_norm": 0.621687650680542,
4697
+ "learning_rate": 0.00019999999948833947,
4698
+ "loss": 0.0343,
4699
+ "step": 654
4700
+ },
4701
+ {
4702
+ "epoch": 0.012410238920782888,
4703
+ "grad_norm": 0.7231767773628235,
4704
+ "learning_rate": 0.00019999999948674924,
4705
+ "loss": 0.0284,
4706
+ "step": 655
4707
+ },
4708
+ {
4709
+ "epoch": 0.012429185850432938,
4710
+ "grad_norm": 0.07879934459924698,
4711
+ "learning_rate": 0.0001999999994851565,
4712
+ "loss": 0.0335,
4713
+ "step": 656
4714
+ },
4715
+ {
4716
+ "epoch": 0.012448132780082987,
4717
+ "grad_norm": 0.08739863336086273,
4718
+ "learning_rate": 0.00019999999948356134,
4719
+ "loss": 0.0412,
4720
+ "step": 657
4721
+ },
4722
+ {
4723
+ "epoch": 0.012467079709733037,
4724
+ "grad_norm": 0.14491000771522522,
4725
+ "learning_rate": 0.00019999999948196369,
4726
+ "loss": 0.0265,
4727
+ "step": 658
4728
+ },
4729
+ {
4730
+ "epoch": 0.012486026639383087,
4731
+ "grad_norm": 0.09429813921451569,
4732
+ "learning_rate": 0.0001999999994803636,
4733
+ "loss": 0.0246,
4734
+ "step": 659
4735
+ },
4736
+ {
4737
+ "epoch": 0.012504973569033137,
4738
+ "grad_norm": 0.282419353723526,
4739
+ "learning_rate": 0.000199999999478761,
4740
+ "loss": 0.0805,
4741
+ "step": 660
4742
+ },
4743
+ {
4744
+ "epoch": 0.012523920498683188,
4745
+ "grad_norm": 0.1799023151397705,
4746
+ "learning_rate": 0.00019999999947715598,
4747
+ "loss": 0.0264,
4748
+ "step": 661
4749
+ },
4750
+ {
4751
+ "epoch": 0.012542867428333238,
4752
+ "grad_norm": 0.18289320170879364,
4753
+ "learning_rate": 0.00019999999947554846,
4754
+ "loss": 0.0493,
4755
+ "step": 662
4756
+ },
4757
+ {
4758
+ "epoch": 0.012561814357983288,
4759
+ "grad_norm": 0.23105517029762268,
4760
+ "learning_rate": 0.00019999999947393845,
4761
+ "loss": 0.0388,
4762
+ "step": 663
4763
+ },
4764
+ {
4765
+ "epoch": 0.012580761287633338,
4766
+ "grad_norm": 0.19387099146842957,
4767
+ "learning_rate": 0.00019999999947232602,
4768
+ "loss": 0.0294,
4769
+ "step": 664
4770
+ },
4771
+ {
4772
+ "epoch": 0.012599708217283389,
4773
+ "grad_norm": 0.07087501883506775,
4774
+ "learning_rate": 0.00019999999947071108,
4775
+ "loss": 0.0219,
4776
+ "step": 665
4777
+ },
4778
+ {
4779
+ "epoch": 0.012618655146933439,
4780
+ "grad_norm": 0.3370932936668396,
4781
+ "learning_rate": 0.00019999999946909372,
4782
+ "loss": 0.0639,
4783
+ "step": 666
4784
+ },
4785
+ {
4786
+ "epoch": 0.012637602076583489,
4787
+ "grad_norm": 0.22018681466579437,
4788
+ "learning_rate": 0.00019999999946747387,
4789
+ "loss": 0.0441,
4790
+ "step": 667
4791
+ },
4792
+ {
4793
+ "epoch": 0.01265654900623354,
4794
+ "grad_norm": 0.06425543874502182,
4795
+ "learning_rate": 0.00019999999946585155,
4796
+ "loss": 0.0174,
4797
+ "step": 668
4798
+ },
4799
+ {
4800
+ "epoch": 0.01267549593588359,
4801
+ "grad_norm": 0.26767414808273315,
4802
+ "learning_rate": 0.00019999999946422676,
4803
+ "loss": 0.0542,
4804
+ "step": 669
4805
+ },
4806
+ {
4807
+ "epoch": 0.01269444286553364,
4808
+ "grad_norm": 0.19884276390075684,
4809
+ "learning_rate": 0.0001999999994625995,
4810
+ "loss": 0.0294,
4811
+ "step": 670
4812
+ },
4813
+ {
4814
+ "epoch": 0.01271338979518369,
4815
+ "grad_norm": 0.17157883942127228,
4816
+ "learning_rate": 0.0001999999994609698,
4817
+ "loss": 0.0324,
4818
+ "step": 671
4819
+ },
4820
+ {
4821
+ "epoch": 0.01273233672483374,
4822
+ "grad_norm": 0.07747308164834976,
4823
+ "learning_rate": 0.0001999999994593376,
4824
+ "loss": 0.0285,
4825
+ "step": 672
4826
+ },
4827
+ {
4828
+ "epoch": 0.01275128365448379,
4829
+ "grad_norm": 0.06676504760980606,
4830
+ "learning_rate": 0.00019999999945770295,
4831
+ "loss": 0.0238,
4832
+ "step": 673
4833
+ },
4834
+ {
4835
+ "epoch": 0.01277023058413384,
4836
+ "grad_norm": 0.3494616448879242,
4837
+ "learning_rate": 0.00019999999945606584,
4838
+ "loss": 0.0369,
4839
+ "step": 674
4840
+ },
4841
+ {
4842
+ "epoch": 0.012789177513783891,
4843
+ "grad_norm": 0.3873502016067505,
4844
+ "learning_rate": 0.00019999999945442625,
4845
+ "loss": 0.0668,
4846
+ "step": 675
4847
+ },
4848
+ {
4849
+ "epoch": 0.012808124443433941,
4850
+ "grad_norm": 0.12122321873903275,
4851
+ "learning_rate": 0.00019999999945278417,
4852
+ "loss": 0.034,
4853
+ "step": 676
4854
+ },
4855
+ {
4856
+ "epoch": 0.012827071373083991,
4857
+ "grad_norm": 0.15738695859909058,
4858
+ "learning_rate": 0.00019999999945113965,
4859
+ "loss": 0.0318,
4860
+ "step": 677
4861
+ },
4862
+ {
4863
+ "epoch": 0.012846018302734042,
4864
+ "grad_norm": 0.24829640984535217,
4865
+ "learning_rate": 0.00019999999944949267,
4866
+ "loss": 0.0497,
4867
+ "step": 678
4868
+ },
4869
+ {
4870
+ "epoch": 0.012864965232384092,
4871
+ "grad_norm": 0.26685649156570435,
4872
+ "learning_rate": 0.0001999999994478432,
4873
+ "loss": 0.053,
4874
+ "step": 679
4875
+ },
4876
+ {
4877
+ "epoch": 0.012883912162034142,
4878
+ "grad_norm": 0.21460634469985962,
4879
+ "learning_rate": 0.00019999999944619127,
4880
+ "loss": 0.0323,
4881
+ "step": 680
4882
+ },
4883
+ {
4884
+ "epoch": 0.012902859091684192,
4885
+ "grad_norm": 0.29949328303337097,
4886
+ "learning_rate": 0.0001999999994445369,
4887
+ "loss": 0.0359,
4888
+ "step": 681
4889
+ },
4890
+ {
4891
+ "epoch": 0.012921806021334243,
4892
+ "grad_norm": 0.22662204504013062,
4893
+ "learning_rate": 0.00019999999944288,
4894
+ "loss": 0.0463,
4895
+ "step": 682
4896
+ },
4897
+ {
4898
+ "epoch": 0.012940752950984293,
4899
+ "grad_norm": 0.164027601480484,
4900
+ "learning_rate": 0.0001999999994412207,
4901
+ "loss": 0.0387,
4902
+ "step": 683
4903
+ },
4904
+ {
4905
+ "epoch": 0.012959699880634343,
4906
+ "grad_norm": 0.12347234785556793,
4907
+ "learning_rate": 0.00019999999943955888,
4908
+ "loss": 0.0301,
4909
+ "step": 684
4910
+ },
4911
+ {
4912
+ "epoch": 0.012978646810284393,
4913
+ "grad_norm": 0.35990050435066223,
4914
+ "learning_rate": 0.00019999999943789463,
4915
+ "loss": 0.0402,
4916
+ "step": 685
4917
+ },
4918
+ {
4919
+ "epoch": 0.012997593739934444,
4920
+ "grad_norm": 0.16936954855918884,
4921
+ "learning_rate": 0.00019999999943622792,
4922
+ "loss": 0.0449,
4923
+ "step": 686
4924
+ },
4925
+ {
4926
+ "epoch": 0.013016540669584494,
4927
+ "grad_norm": 0.2638076841831207,
4928
+ "learning_rate": 0.0001999999994345587,
4929
+ "loss": 0.0564,
4930
+ "step": 687
4931
+ },
4932
+ {
4933
+ "epoch": 0.013035487599234544,
4934
+ "grad_norm": 0.2737369239330292,
4935
+ "learning_rate": 0.00019999999943288703,
4936
+ "loss": 0.0477,
4937
+ "step": 688
4938
+ },
4939
+ {
4940
+ "epoch": 0.013054434528884594,
4941
+ "grad_norm": 0.14391621947288513,
4942
+ "learning_rate": 0.0001999999994312129,
4943
+ "loss": 0.0374,
4944
+ "step": 689
4945
+ },
4946
+ {
4947
+ "epoch": 0.013073381458534645,
4948
+ "grad_norm": 0.049424514174461365,
4949
+ "learning_rate": 0.00019999999942953628,
4950
+ "loss": 0.021,
4951
+ "step": 690
4952
+ },
4953
+ {
4954
+ "epoch": 0.013092328388184695,
4955
+ "grad_norm": 0.22283080220222473,
4956
+ "learning_rate": 0.00019999999942785723,
4957
+ "loss": 0.045,
4958
+ "step": 691
4959
+ },
4960
+ {
4961
+ "epoch": 0.013111275317834745,
4962
+ "grad_norm": 0.15707112848758698,
4963
+ "learning_rate": 0.00019999999942617568,
4964
+ "loss": 0.0307,
4965
+ "step": 692
4966
+ },
4967
+ {
4968
+ "epoch": 0.013130222247484795,
4969
+ "grad_norm": 0.18966948986053467,
4970
+ "learning_rate": 0.0001999999994244917,
4971
+ "loss": 0.045,
4972
+ "step": 693
4973
+ },
4974
+ {
4975
+ "epoch": 0.013149169177134845,
4976
+ "grad_norm": 0.06199893355369568,
4977
+ "learning_rate": 0.00019999999942280522,
4978
+ "loss": 0.0229,
4979
+ "step": 694
4980
+ },
4981
+ {
4982
+ "epoch": 0.013168116106784896,
4983
+ "grad_norm": 0.10518538951873779,
4984
+ "learning_rate": 0.00019999999942111628,
4985
+ "loss": 0.0334,
4986
+ "step": 695
4987
+ },
4988
+ {
4989
+ "epoch": 0.013187063036434946,
4990
+ "grad_norm": 0.40504929423332214,
4991
+ "learning_rate": 0.0001999999994194249,
4992
+ "loss": 0.0544,
4993
+ "step": 696
4994
+ },
4995
+ {
4996
+ "epoch": 0.013206009966084996,
4997
+ "grad_norm": 0.06708107143640518,
4998
+ "learning_rate": 0.00019999999941773102,
4999
+ "loss": 0.0235,
5000
+ "step": 697
5001
+ },
5002
+ {
5003
+ "epoch": 0.013224956895735046,
5004
+ "grad_norm": 0.12683062255382538,
5005
+ "learning_rate": 0.00019999999941603468,
5006
+ "loss": 0.0275,
5007
+ "step": 698
5008
+ },
5009
+ {
5010
+ "epoch": 0.013243903825385097,
5011
+ "grad_norm": 0.08281129598617554,
5012
+ "learning_rate": 0.00019999999941433587,
5013
+ "loss": 0.0365,
5014
+ "step": 699
5015
+ },
5016
+ {
5017
+ "epoch": 0.013262850755035147,
5018
+ "grad_norm": 0.3280352056026459,
5019
+ "learning_rate": 0.0001999999994126346,
5020
+ "loss": 0.0484,
5021
+ "step": 700
5022
+ },
5023
+ {
5024
+ "epoch": 0.013262850755035147,
5025
+ "eval_loss": 0.008949960581958294,
5026
+ "eval_runtime": 553.8099,
5027
+ "eval_samples_per_second": 40.127,
5028
+ "eval_steps_per_second": 20.065,
5029
+ "step": 700
5030
  }
5031
  ],
5032
  "logging_steps": 1,
 
5046
  "attributes": {}
5047
  }
5048
  },
5049
+ "total_flos": 7.46484314406912e+16,
5050
  "train_batch_size": 2,
5051
  "trial_name": null,
5052
  "trial_params": null