jssky commited on
Commit
5b9fc2c
·
verified ·
1 Parent(s): 75464ae

Training in progress, step 186, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b77cad872e4bcc0e7430f7ae5e5dd49663565ee18e3d3f00e8e2a69232aaefe9
3
  size 80013120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7183e64a109a343469dca147a8c0f81c155762d2008823f579ca9d3c894683e
3
  size 80013120
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:07e5a97fc87827d4ac283ef085c485e13587e49c599f0bc37667e3ec5c9fd3b5
3
  size 41119636
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05740b4cc52050b284c0a5ae3bb1a0c79e4e11d33baa793c0b7886b2990dd202
3
  size 41119636
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e43e7b805b054b95bdd6a42492a7a566a708443ec3c2c635b33190dba7252c59
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74d5b0d31a7bfc657111f3d8a8c89bd9f54c57945ce1f937d44749b81c417e07
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b893c9226071507bd706a0d6a7a997c6693067f1b6d62a3307e8595b10559486
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51627bede7e359d4449d36ee1f729a3d0065d65146d217ca0847d4a1da7e2115
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.25016812373907193,
5
  "eval_steps": 93,
6
- "global_step": 93,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -666,6 +666,665 @@
666
  "eval_samples_per_second": 14.493,
667
  "eval_steps_per_second": 7.293,
668
  "step": 93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
669
  }
670
  ],
671
  "logging_steps": 1,
@@ -685,7 +1344,7 @@
685
  "attributes": {}
686
  }
687
  },
688
- "total_flos": 3.029473590194995e+16,
689
  "train_batch_size": 2,
690
  "trial_name": null,
691
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5003362474781439,
5
  "eval_steps": 93,
6
+ "global_step": 186,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
666
  "eval_samples_per_second": 14.493,
667
  "eval_steps_per_second": 7.293,
668
  "step": 93
669
+ },
670
+ {
671
+ "epoch": 0.2528581035642233,
672
+ "grad_norm": 5.78995418548584,
673
+ "learning_rate": 0.00017458486592061704,
674
+ "loss": 0.9346,
675
+ "step": 94
676
+ },
677
+ {
678
+ "epoch": 0.25554808338937457,
679
+ "grad_norm": 3.517900228500366,
680
+ "learning_rate": 0.00017400398070435293,
681
+ "loss": 0.3506,
682
+ "step": 95
683
+ },
684
+ {
685
+ "epoch": 0.2582380632145259,
686
+ "grad_norm": 5.804417610168457,
687
+ "learning_rate": 0.00017341752189883983,
688
+ "loss": 0.4959,
689
+ "step": 96
690
+ },
691
+ {
692
+ "epoch": 0.2609280430396772,
693
+ "grad_norm": 8.148117065429688,
694
+ "learning_rate": 0.00017282553367305975,
695
+ "loss": 0.9842,
696
+ "step": 97
697
+ },
698
+ {
699
+ "epoch": 0.2636180228648285,
700
+ "grad_norm": 9.511378288269043,
701
+ "learning_rate": 0.0001722280606124415,
702
+ "loss": 0.7143,
703
+ "step": 98
704
+ },
705
+ {
706
+ "epoch": 0.26630800268997984,
707
+ "grad_norm": 6.079991340637207,
708
+ "learning_rate": 0.00017162514771550255,
709
+ "loss": 0.2979,
710
+ "step": 99
711
+ },
712
+ {
713
+ "epoch": 0.26899798251513113,
714
+ "grad_norm": 6.114333152770996,
715
+ "learning_rate": 0.00017101684039046036,
716
+ "loss": 0.5812,
717
+ "step": 100
718
+ },
719
+ {
720
+ "epoch": 0.2716879623402825,
721
+ "grad_norm": 4.91884183883667,
722
+ "learning_rate": 0.0001704031844518121,
723
+ "loss": 1.8317,
724
+ "step": 101
725
+ },
726
+ {
727
+ "epoch": 0.27437794216543376,
728
+ "grad_norm": 5.735188007354736,
729
+ "learning_rate": 0.0001697842261168843,
730
+ "loss": 2.3345,
731
+ "step": 102
732
+ },
733
+ {
734
+ "epoch": 0.27706792199058505,
735
+ "grad_norm": 5.317649841308594,
736
+ "learning_rate": 0.0001691600120023521,
737
+ "loss": 2.0851,
738
+ "step": 103
739
+ },
740
+ {
741
+ "epoch": 0.2797579018157364,
742
+ "grad_norm": 7.778799057006836,
743
+ "learning_rate": 0.00016853058912072802,
744
+ "loss": 1.1674,
745
+ "step": 104
746
+ },
747
+ {
748
+ "epoch": 0.2824478816408877,
749
+ "grad_norm": 4.196416854858398,
750
+ "learning_rate": 0.00016789600487682156,
751
+ "loss": 1.5939,
752
+ "step": 105
753
+ },
754
+ {
755
+ "epoch": 0.285137861466039,
756
+ "grad_norm": 4.3927741050720215,
757
+ "learning_rate": 0.0001672563070641688,
758
+ "loss": 1.4615,
759
+ "step": 106
760
+ },
761
+ {
762
+ "epoch": 0.2878278412911903,
763
+ "grad_norm": 4.284142017364502,
764
+ "learning_rate": 0.0001666115438614328,
765
+ "loss": 1.9508,
766
+ "step": 107
767
+ },
768
+ {
769
+ "epoch": 0.2905178211163416,
770
+ "grad_norm": 5.4508867263793945,
771
+ "learning_rate": 0.00016596176382877506,
772
+ "loss": 1.3256,
773
+ "step": 108
774
+ },
775
+ {
776
+ "epoch": 0.29320780094149296,
777
+ "grad_norm": 11.987678527832031,
778
+ "learning_rate": 0.00016530701590419824,
779
+ "loss": 0.9202,
780
+ "step": 109
781
+ },
782
+ {
783
+ "epoch": 0.29589778076664425,
784
+ "grad_norm": 5.667636394500732,
785
+ "learning_rate": 0.00016464734939986036,
786
+ "loss": 1.3247,
787
+ "step": 110
788
+ },
789
+ {
790
+ "epoch": 0.29858776059179554,
791
+ "grad_norm": 3.8087687492370605,
792
+ "learning_rate": 0.00016398281399836097,
793
+ "loss": 0.9626,
794
+ "step": 111
795
+ },
796
+ {
797
+ "epoch": 0.3012777404169469,
798
+ "grad_norm": 5.772204875946045,
799
+ "learning_rate": 0.00016331345974899923,
800
+ "loss": 1.3912,
801
+ "step": 112
802
+ },
803
+ {
804
+ "epoch": 0.30396772024209817,
805
+ "grad_norm": 3.2174160480499268,
806
+ "learning_rate": 0.00016263933706400451,
807
+ "loss": 1.0545,
808
+ "step": 113
809
+ },
810
+ {
811
+ "epoch": 0.3066577000672495,
812
+ "grad_norm": 3.539743423461914,
813
+ "learning_rate": 0.00016196049671473954,
814
+ "loss": 0.9489,
815
+ "step": 114
816
+ },
817
+ {
818
+ "epoch": 0.3093476798924008,
819
+ "grad_norm": 3.6935033798217773,
820
+ "learning_rate": 0.0001612769898278766,
821
+ "loss": 1.0005,
822
+ "step": 115
823
+ },
824
+ {
825
+ "epoch": 0.3120376597175521,
826
+ "grad_norm": 3.477961301803589,
827
+ "learning_rate": 0.00016058886788154712,
828
+ "loss": 0.6155,
829
+ "step": 116
830
+ },
831
+ {
832
+ "epoch": 0.31472763954270344,
833
+ "grad_norm": 3.9399242401123047,
834
+ "learning_rate": 0.00015989618270146423,
835
+ "loss": 0.7689,
836
+ "step": 117
837
+ },
838
+ {
839
+ "epoch": 0.31741761936785473,
840
+ "grad_norm": 4.4496846199035645,
841
+ "learning_rate": 0.0001591989864570199,
842
+ "loss": 1.0174,
843
+ "step": 118
844
+ },
845
+ {
846
+ "epoch": 0.3201075991930061,
847
+ "grad_norm": 4.519758224487305,
848
+ "learning_rate": 0.00015849733165735556,
849
+ "loss": 0.9051,
850
+ "step": 119
851
+ },
852
+ {
853
+ "epoch": 0.32279757901815737,
854
+ "grad_norm": 3.636235237121582,
855
+ "learning_rate": 0.00015779127114740757,
856
+ "loss": 0.5993,
857
+ "step": 120
858
+ },
859
+ {
860
+ "epoch": 0.32548755884330866,
861
+ "grad_norm": 2.2947537899017334,
862
+ "learning_rate": 0.0001570808581039271,
863
+ "loss": 0.23,
864
+ "step": 121
865
+ },
866
+ {
867
+ "epoch": 0.32817753866846,
868
+ "grad_norm": 3.0490782260894775,
869
+ "learning_rate": 0.00015636614603147512,
870
+ "loss": 0.5818,
871
+ "step": 122
872
+ },
873
+ {
874
+ "epoch": 0.3308675184936113,
875
+ "grad_norm": 3.2933220863342285,
876
+ "learning_rate": 0.0001556471887583929,
877
+ "loss": 0.6548,
878
+ "step": 123
879
+ },
880
+ {
881
+ "epoch": 0.33355749831876264,
882
+ "grad_norm": 4.488528251647949,
883
+ "learning_rate": 0.0001549240404327477,
884
+ "loss": 0.9628,
885
+ "step": 124
886
+ },
887
+ {
888
+ "epoch": 0.3362474781439139,
889
+ "grad_norm": 4.679425239562988,
890
+ "learning_rate": 0.00015419675551825475,
891
+ "loss": 0.4106,
892
+ "step": 125
893
+ },
894
+ {
895
+ "epoch": 0.3389374579690652,
896
+ "grad_norm": 4.400868892669678,
897
+ "learning_rate": 0.0001534653887901754,
898
+ "loss": 0.3852,
899
+ "step": 126
900
+ },
901
+ {
902
+ "epoch": 0.34162743779421656,
903
+ "grad_norm": 4.978918552398682,
904
+ "learning_rate": 0.00015272999533119162,
905
+ "loss": 0.8162,
906
+ "step": 127
907
+ },
908
+ {
909
+ "epoch": 0.34431741761936785,
910
+ "grad_norm": 5.046586990356445,
911
+ "learning_rate": 0.00015199063052725745,
912
+ "loss": 0.649,
913
+ "step": 128
914
+ },
915
+ {
916
+ "epoch": 0.34700739744451914,
917
+ "grad_norm": 7.412467956542969,
918
+ "learning_rate": 0.0001512473500634277,
919
+ "loss": 0.6579,
920
+ "step": 129
921
+ },
922
+ {
923
+ "epoch": 0.3496973772696705,
924
+ "grad_norm": 3.8262441158294678,
925
+ "learning_rate": 0.00015050020991966406,
926
+ "loss": 0.4359,
927
+ "step": 130
928
+ },
929
+ {
930
+ "epoch": 0.3523873570948218,
931
+ "grad_norm": 5.179169654846191,
932
+ "learning_rate": 0.0001497492663666189,
933
+ "loss": 0.5676,
934
+ "step": 131
935
+ },
936
+ {
937
+ "epoch": 0.3550773369199731,
938
+ "grad_norm": 5.74229097366333,
939
+ "learning_rate": 0.00014899457596139729,
940
+ "loss": 0.3635,
941
+ "step": 132
942
+ },
943
+ {
944
+ "epoch": 0.3577673167451244,
945
+ "grad_norm": 7.098540782928467,
946
+ "learning_rate": 0.00014823619554329745,
947
+ "loss": 0.996,
948
+ "step": 133
949
+ },
950
+ {
951
+ "epoch": 0.3604572965702757,
952
+ "grad_norm": 4.635382652282715,
953
+ "learning_rate": 0.00014747418222952995,
954
+ "loss": 0.7149,
955
+ "step": 134
956
+ },
957
+ {
958
+ "epoch": 0.36314727639542704,
959
+ "grad_norm": 3.750243663787842,
960
+ "learning_rate": 0.0001467085934109158,
961
+ "loss": 0.3169,
962
+ "step": 135
963
+ },
964
+ {
965
+ "epoch": 0.36583725622057833,
966
+ "grad_norm": 4.545015811920166,
967
+ "learning_rate": 0.00014593948674756417,
968
+ "loss": 0.5511,
969
+ "step": 136
970
+ },
971
+ {
972
+ "epoch": 0.3685272360457297,
973
+ "grad_norm": 5.990297794342041,
974
+ "learning_rate": 0.0001451669201645298,
975
+ "loss": 0.766,
976
+ "step": 137
977
+ },
978
+ {
979
+ "epoch": 0.37121721587088097,
980
+ "grad_norm": 3.692354679107666,
981
+ "learning_rate": 0.00014439095184745024,
982
+ "loss": 0.4151,
983
+ "step": 138
984
+ },
985
+ {
986
+ "epoch": 0.37390719569603226,
987
+ "grad_norm": 3.4247729778289795,
988
+ "learning_rate": 0.00014361164023816376,
989
+ "loss": 0.466,
990
+ "step": 139
991
+ },
992
+ {
993
+ "epoch": 0.3765971755211836,
994
+ "grad_norm": 3.962257146835327,
995
+ "learning_rate": 0.00014282904403030772,
996
+ "loss": 0.4263,
997
+ "step": 140
998
+ },
999
+ {
1000
+ "epoch": 0.3792871553463349,
1001
+ "grad_norm": 5.7197771072387695,
1002
+ "learning_rate": 0.00014204322216489814,
1003
+ "loss": 0.4988,
1004
+ "step": 141
1005
+ },
1006
+ {
1007
+ "epoch": 0.38197713517148624,
1008
+ "grad_norm": 5.587864398956299,
1009
+ "learning_rate": 0.00014125423382589048,
1010
+ "loss": 0.6946,
1011
+ "step": 142
1012
+ },
1013
+ {
1014
+ "epoch": 0.3846671149966375,
1015
+ "grad_norm": 11.981307029724121,
1016
+ "learning_rate": 0.00014046213843572236,
1017
+ "loss": 0.7456,
1018
+ "step": 143
1019
+ },
1020
+ {
1021
+ "epoch": 0.3873570948217888,
1022
+ "grad_norm": 6.747979164123535,
1023
+ "learning_rate": 0.00013966699565083802,
1024
+ "loss": 1.2804,
1025
+ "step": 144
1026
+ },
1027
+ {
1028
+ "epoch": 0.39004707464694016,
1029
+ "grad_norm": 4.663575649261475,
1030
+ "learning_rate": 0.0001388688653571954,
1031
+ "loss": 0.5548,
1032
+ "step": 145
1033
+ },
1034
+ {
1035
+ "epoch": 0.39273705447209145,
1036
+ "grad_norm": 5.274585247039795,
1037
+ "learning_rate": 0.00013806780766575588,
1038
+ "loss": 0.6681,
1039
+ "step": 146
1040
+ },
1041
+ {
1042
+ "epoch": 0.3954270342972428,
1043
+ "grad_norm": 13.038918495178223,
1044
+ "learning_rate": 0.00013726388290795697,
1045
+ "loss": 1.082,
1046
+ "step": 147
1047
+ },
1048
+ {
1049
+ "epoch": 0.3981170141223941,
1050
+ "grad_norm": 7.035642623901367,
1051
+ "learning_rate": 0.00013645715163116846,
1052
+ "loss": 0.3975,
1053
+ "step": 148
1054
+ },
1055
+ {
1056
+ "epoch": 0.4008069939475454,
1057
+ "grad_norm": 5.065128326416016,
1058
+ "learning_rate": 0.00013564767459413237,
1059
+ "loss": 0.2747,
1060
+ "step": 149
1061
+ },
1062
+ {
1063
+ "epoch": 0.4034969737726967,
1064
+ "grad_norm": 4.475830554962158,
1065
+ "learning_rate": 0.0001348355127623869,
1066
+ "loss": 0.2169,
1067
+ "step": 150
1068
+ },
1069
+ {
1070
+ "epoch": 0.406186953597848,
1071
+ "grad_norm": 4.0652031898498535,
1072
+ "learning_rate": 0.00013402072730367475,
1073
+ "loss": 1.7546,
1074
+ "step": 151
1075
+ },
1076
+ {
1077
+ "epoch": 0.4088769334229993,
1078
+ "grad_norm": 4.62870454788208,
1079
+ "learning_rate": 0.0001332033795833364,
1080
+ "loss": 1.5081,
1081
+ "step": 152
1082
+ },
1083
+ {
1084
+ "epoch": 0.41156691324815065,
1085
+ "grad_norm": 3.8758082389831543,
1086
+ "learning_rate": 0.0001323835311596884,
1087
+ "loss": 1.371,
1088
+ "step": 153
1089
+ },
1090
+ {
1091
+ "epoch": 0.41425689307330194,
1092
+ "grad_norm": 4.078228950500488,
1093
+ "learning_rate": 0.00013156124377938699,
1094
+ "loss": 1.5507,
1095
+ "step": 154
1096
+ },
1097
+ {
1098
+ "epoch": 0.4169468728984533,
1099
+ "grad_norm": 3.6525630950927734,
1100
+ "learning_rate": 0.0001307365793727778,
1101
+ "loss": 1.1093,
1102
+ "step": 155
1103
+ },
1104
+ {
1105
+ "epoch": 0.41963685272360457,
1106
+ "grad_norm": 4.3088202476501465,
1107
+ "learning_rate": 0.00012990960004923154,
1108
+ "loss": 1.6154,
1109
+ "step": 156
1110
+ },
1111
+ {
1112
+ "epoch": 0.42232683254875586,
1113
+ "grad_norm": 4.335425853729248,
1114
+ "learning_rate": 0.00012908036809246623,
1115
+ "loss": 1.4037,
1116
+ "step": 157
1117
+ },
1118
+ {
1119
+ "epoch": 0.4250168123739072,
1120
+ "grad_norm": 3.7850985527038574,
1121
+ "learning_rate": 0.00012824894595585637,
1122
+ "loss": 1.1471,
1123
+ "step": 158
1124
+ },
1125
+ {
1126
+ "epoch": 0.4277067921990585,
1127
+ "grad_norm": 4.085525035858154,
1128
+ "learning_rate": 0.00012741539625772918,
1129
+ "loss": 1.2586,
1130
+ "step": 159
1131
+ },
1132
+ {
1133
+ "epoch": 0.43039677202420984,
1134
+ "grad_norm": 3.4970481395721436,
1135
+ "learning_rate": 0.0001265797817766486,
1136
+ "loss": 1.1133,
1137
+ "step": 160
1138
+ },
1139
+ {
1140
+ "epoch": 0.43308675184936113,
1141
+ "grad_norm": 4.015367031097412,
1142
+ "learning_rate": 0.0001257421654466872,
1143
+ "loss": 0.71,
1144
+ "step": 161
1145
+ },
1146
+ {
1147
+ "epoch": 0.4357767316745124,
1148
+ "grad_norm": 3.805530071258545,
1149
+ "learning_rate": 0.00012490261035268612,
1150
+ "loss": 1.4369,
1151
+ "step": 162
1152
+ },
1153
+ {
1154
+ "epoch": 0.43846671149966376,
1155
+ "grad_norm": 4.442086696624756,
1156
+ "learning_rate": 0.00012406117972550414,
1157
+ "loss": 1.1577,
1158
+ "step": 163
1159
+ },
1160
+ {
1161
+ "epoch": 0.44115669132481505,
1162
+ "grad_norm": 3.171997308731079,
1163
+ "learning_rate": 0.00012321793693725509,
1164
+ "loss": 0.667,
1165
+ "step": 164
1166
+ },
1167
+ {
1168
+ "epoch": 0.4438466711499664,
1169
+ "grad_norm": 4.185075759887695,
1170
+ "learning_rate": 0.0001223729454965354,
1171
+ "loss": 0.7278,
1172
+ "step": 165
1173
+ },
1174
+ {
1175
+ "epoch": 0.4465366509751177,
1176
+ "grad_norm": 3.8975086212158203,
1177
+ "learning_rate": 0.00012152626904364067,
1178
+ "loss": 0.9939,
1179
+ "step": 166
1180
+ },
1181
+ {
1182
+ "epoch": 0.449226630800269,
1183
+ "grad_norm": 3.1474146842956543,
1184
+ "learning_rate": 0.00012067797134577275,
1185
+ "loss": 0.7392,
1186
+ "step": 167
1187
+ },
1188
+ {
1189
+ "epoch": 0.4519166106254203,
1190
+ "grad_norm": 3.5632522106170654,
1191
+ "learning_rate": 0.00011982811629223709,
1192
+ "loss": 0.8636,
1193
+ "step": 168
1194
+ },
1195
+ {
1196
+ "epoch": 0.4546065904505716,
1197
+ "grad_norm": 2.6525533199310303,
1198
+ "learning_rate": 0.00011897676788963101,
1199
+ "loss": 0.3818,
1200
+ "step": 169
1201
+ },
1202
+ {
1203
+ "epoch": 0.45729657027572296,
1204
+ "grad_norm": 3.889469861984253,
1205
+ "learning_rate": 0.0001181239902570229,
1206
+ "loss": 0.5985,
1207
+ "step": 170
1208
+ },
1209
+ {
1210
+ "epoch": 0.45998655010087425,
1211
+ "grad_norm": 3.6286370754241943,
1212
+ "learning_rate": 0.00011726984762112328,
1213
+ "loss": 0.8639,
1214
+ "step": 171
1215
+ },
1216
+ {
1217
+ "epoch": 0.46267652992602554,
1218
+ "grad_norm": 2.5282163619995117,
1219
+ "learning_rate": 0.0001164144043114475,
1220
+ "loss": 0.3303,
1221
+ "step": 172
1222
+ },
1223
+ {
1224
+ "epoch": 0.4653665097511769,
1225
+ "grad_norm": 4.00683069229126,
1226
+ "learning_rate": 0.00011555772475547084,
1227
+ "loss": 0.414,
1228
+ "step": 173
1229
+ },
1230
+ {
1231
+ "epoch": 0.46805648957632817,
1232
+ "grad_norm": 5.255921363830566,
1233
+ "learning_rate": 0.00011469987347377602,
1234
+ "loss": 0.8622,
1235
+ "step": 174
1236
+ },
1237
+ {
1238
+ "epoch": 0.47074646940147946,
1239
+ "grad_norm": 4.201918601989746,
1240
+ "learning_rate": 0.00011384091507519403,
1241
+ "loss": 0.8862,
1242
+ "step": 175
1243
+ },
1244
+ {
1245
+ "epoch": 0.4734364492266308,
1246
+ "grad_norm": 4.199880599975586,
1247
+ "learning_rate": 0.00011298091425193806,
1248
+ "loss": 0.4554,
1249
+ "step": 176
1250
+ },
1251
+ {
1252
+ "epoch": 0.4761264290517821,
1253
+ "grad_norm": 3.6669838428497314,
1254
+ "learning_rate": 0.00011211993577473121,
1255
+ "loss": 0.343,
1256
+ "step": 177
1257
+ },
1258
+ {
1259
+ "epoch": 0.47881640887693344,
1260
+ "grad_norm": 4.186169147491455,
1261
+ "learning_rate": 0.00011125804448792831,
1262
+ "loss": 0.8039,
1263
+ "step": 178
1264
+ },
1265
+ {
1266
+ "epoch": 0.48150638870208473,
1267
+ "grad_norm": 4.209519386291504,
1268
+ "learning_rate": 0.00011039530530463218,
1269
+ "loss": 0.3221,
1270
+ "step": 179
1271
+ },
1272
+ {
1273
+ "epoch": 0.484196368527236,
1274
+ "grad_norm": 2.875875234603882,
1275
+ "learning_rate": 0.00010953178320180475,
1276
+ "loss": 0.2874,
1277
+ "step": 180
1278
+ },
1279
+ {
1280
+ "epoch": 0.48688634835238737,
1281
+ "grad_norm": 4.24071741104126,
1282
+ "learning_rate": 0.00010866754321537338,
1283
+ "loss": 0.5502,
1284
+ "step": 181
1285
+ },
1286
+ {
1287
+ "epoch": 0.48957632817753866,
1288
+ "grad_norm": 4.230165481567383,
1289
+ "learning_rate": 0.0001078026504353325,
1290
+ "loss": 0.5396,
1291
+ "step": 182
1292
+ },
1293
+ {
1294
+ "epoch": 0.49226630800269,
1295
+ "grad_norm": 4.387772560119629,
1296
+ "learning_rate": 0.0001069371700008416,
1297
+ "loss": 0.5987,
1298
+ "step": 183
1299
+ },
1300
+ {
1301
+ "epoch": 0.4949562878278413,
1302
+ "grad_norm": 4.988356113433838,
1303
+ "learning_rate": 0.00010607116709531918,
1304
+ "loss": 0.6046,
1305
+ "step": 184
1306
+ },
1307
+ {
1308
+ "epoch": 0.4976462676529926,
1309
+ "grad_norm": 4.388515472412109,
1310
+ "learning_rate": 0.00010520470694153353,
1311
+ "loss": 0.595,
1312
+ "step": 185
1313
+ },
1314
+ {
1315
+ "epoch": 0.5003362474781439,
1316
+ "grad_norm": 4.310067653656006,
1317
+ "learning_rate": 0.00010433785479669038,
1318
+ "loss": 0.5557,
1319
+ "step": 186
1320
+ },
1321
+ {
1322
+ "epoch": 0.5003362474781439,
1323
+ "eval_loss": 0.8278390765190125,
1324
+ "eval_runtime": 10.698,
1325
+ "eval_samples_per_second": 14.676,
1326
+ "eval_steps_per_second": 7.385,
1327
+ "step": 186
1328
  }
1329
  ],
1330
  "logging_steps": 1,
 
1344
  "attributes": {}
1345
  }
1346
  },
1347
+ "total_flos": 6.05894718038999e+16,
1348
  "train_batch_size": 2,
1349
  "trial_name": null,
1350
  "trial_params": null