fats-fme commited on
Commit
723cbe7
·
verified ·
1 Parent(s): 5f5ca4a

Training in progress, step 152, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24ada2f250fa0d9edb2dcc52814d9f59f6d4b9ba5325ecfd7d9e1bdea433b827
3
  size 200068512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a47bdc1f52e9687f1b2065118ba5ed67f0e7ddf452b628d16653fe52dd7f3c5a
3
  size 200068512
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:85091c047e66f773d03e75430fa11253adf7c04c48202c876bfc8f47f5de415c
3
  size 400361770
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3c67139bafbd6e7779c67dd433ab463e343894e27d0d229c7eb363ec450984e
3
  size 400361770
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca38112adfe6a323d5a1c59e7e51101957b14585fe735c539e8cc8bd51a5c3cb
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:502476aa04a593c704ac84967b6f43b1590a9e6c3672e589c240c6f9d4b929a7
3
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0febd423b29b1ae6ab8f49a5c0c261ae53be29b6f83168fad5cf7d1356b2650
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71dbabef39979f13d0c4a4aefe6ca255e0c18d780788adac8fb329d89525cd7a
3
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:adae00deb0c1057a8fffd51118c3e03f2b7e37c29fe1e99873db394f6ac98449
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a18cea3621f9dc2789cea415c39b96c4e45945922514433b7b11735bf5d8256
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.04962455109369899,
5
  "eval_steps": 76,
6
- "global_step": 76,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -555,6 +555,546 @@
555
  "eval_samples_per_second": 3.664,
556
  "eval_steps_per_second": 0.916,
557
  "step": 76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
  }
559
  ],
560
  "logging_steps": 1,
@@ -574,7 +1114,7 @@
574
  "attributes": {}
575
  }
576
  },
577
- "total_flos": 2.331358757084201e+17,
578
  "train_batch_size": 2,
579
  "trial_name": null,
580
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.09924910218739798,
5
  "eval_steps": 76,
6
+ "global_step": 152,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
555
  "eval_samples_per_second": 3.664,
556
  "eval_steps_per_second": 0.916,
557
  "step": 76
558
+ },
559
+ {
560
+ "epoch": 0.05027750571335292,
561
+ "grad_norm": 3.9176759719848633,
562
+ "learning_rate": 0.0001944321908543708,
563
+ "loss": 1.9556,
564
+ "step": 77
565
+ },
566
+ {
567
+ "epoch": 0.050930460333006855,
568
+ "grad_norm": 3.578507423400879,
569
+ "learning_rate": 0.00019401636137990816,
570
+ "loss": 2.025,
571
+ "step": 78
572
+ },
573
+ {
574
+ "epoch": 0.05158341495266079,
575
+ "grad_norm": 3.641873598098755,
576
+ "learning_rate": 0.00019358603562568416,
577
+ "loss": 1.667,
578
+ "step": 79
579
+ },
580
+ {
581
+ "epoch": 0.05223636957231472,
582
+ "grad_norm": 4.218084335327148,
583
+ "learning_rate": 0.0001931412799431554,
584
+ "loss": 1.797,
585
+ "step": 80
586
+ },
587
+ {
588
+ "epoch": 0.05288932419196866,
589
+ "grad_norm": 3.496741533279419,
590
+ "learning_rate": 0.0001926821629087133,
591
+ "loss": 1.5076,
592
+ "step": 81
593
+ },
594
+ {
595
+ "epoch": 0.053542278811622594,
596
+ "grad_norm": 5.606319427490234,
597
+ "learning_rate": 0.00019220875531311045,
598
+ "loss": 1.5796,
599
+ "step": 82
600
+ },
601
+ {
602
+ "epoch": 0.054195233431276524,
603
+ "grad_norm": 3.143007516860962,
604
+ "learning_rate": 0.00019172113015054532,
605
+ "loss": 1.4222,
606
+ "step": 83
607
+ },
608
+ {
609
+ "epoch": 0.05484818805093046,
610
+ "grad_norm": 3.359457015991211,
611
+ "learning_rate": 0.00019121936260740752,
612
+ "loss": 0.9389,
613
+ "step": 84
614
+ },
615
+ {
616
+ "epoch": 0.0555011426705844,
617
+ "grad_norm": 3.911376476287842,
618
+ "learning_rate": 0.00019070353005068484,
619
+ "loss": 1.8963,
620
+ "step": 85
621
+ },
622
+ {
623
+ "epoch": 0.056154097290238326,
624
+ "grad_norm": 3.542954444885254,
625
+ "learning_rate": 0.00019017371201603407,
626
+ "loss": 1.4677,
627
+ "step": 86
628
+ },
629
+ {
630
+ "epoch": 0.05680705190989226,
631
+ "grad_norm": 3.1694722175598145,
632
+ "learning_rate": 0.00018962999019551754,
633
+ "loss": 1.4803,
634
+ "step": 87
635
+ },
636
+ {
637
+ "epoch": 0.0574600065295462,
638
+ "grad_norm": 2.960282564163208,
639
+ "learning_rate": 0.00018907244842500704,
640
+ "loss": 1.8923,
641
+ "step": 88
642
+ },
643
+ {
644
+ "epoch": 0.05811296114920013,
645
+ "grad_norm": 2.6101479530334473,
646
+ "learning_rate": 0.00018850117267125738,
647
+ "loss": 1.9243,
648
+ "step": 89
649
+ },
650
+ {
651
+ "epoch": 0.058765915768854066,
652
+ "grad_norm": 3.0619993209838867,
653
+ "learning_rate": 0.00018791625101865117,
654
+ "loss": 2.1384,
655
+ "step": 90
656
+ },
657
+ {
658
+ "epoch": 0.059418870388507995,
659
+ "grad_norm": 2.6776371002197266,
660
+ "learning_rate": 0.0001873177736556172,
661
+ "loss": 1.7285,
662
+ "step": 91
663
+ },
664
+ {
665
+ "epoch": 0.06007182500816193,
666
+ "grad_norm": 3.687798023223877,
667
+ "learning_rate": 0.00018670583286072443,
668
+ "loss": 1.8332,
669
+ "step": 92
670
+ },
671
+ {
672
+ "epoch": 0.06072477962781587,
673
+ "grad_norm": 2.632847547531128,
674
+ "learning_rate": 0.0001860805229884536,
675
+ "loss": 1.8342,
676
+ "step": 93
677
+ },
678
+ {
679
+ "epoch": 0.0613777342474698,
680
+ "grad_norm": 3.5173568725585938,
681
+ "learning_rate": 0.00018544194045464886,
682
+ "loss": 2.004,
683
+ "step": 94
684
+ },
685
+ {
686
+ "epoch": 0.062030688867123734,
687
+ "grad_norm": 3.2944045066833496,
688
+ "learning_rate": 0.0001847901837216515,
689
+ "loss": 1.8861,
690
+ "step": 95
691
+ },
692
+ {
693
+ "epoch": 0.06268364348677767,
694
+ "grad_norm": 3.5300235748291016,
695
+ "learning_rate": 0.00018412535328311814,
696
+ "loss": 1.7608,
697
+ "step": 96
698
+ },
699
+ {
700
+ "epoch": 0.06333659810643161,
701
+ "grad_norm": 3.253826856613159,
702
+ "learning_rate": 0.0001834475516485257,
703
+ "loss": 1.9151,
704
+ "step": 97
705
+ },
706
+ {
707
+ "epoch": 0.06398955272608553,
708
+ "grad_norm": 3.243023633956909,
709
+ "learning_rate": 0.00018275688332736577,
710
+ "loss": 1.5671,
711
+ "step": 98
712
+ },
713
+ {
714
+ "epoch": 0.06464250734573947,
715
+ "grad_norm": 3.3818089962005615,
716
+ "learning_rate": 0.00018205345481302998,
717
+ "loss": 1.4077,
718
+ "step": 99
719
+ },
720
+ {
721
+ "epoch": 0.0652954619653934,
722
+ "grad_norm": 3.632511615753174,
723
+ "learning_rate": 0.00018133737456639044,
724
+ "loss": 1.0454,
725
+ "step": 100
726
+ },
727
+ {
728
+ "epoch": 0.06594841658504734,
729
+ "grad_norm": 3.6621978282928467,
730
+ "learning_rate": 0.0001806087529990758,
731
+ "loss": 2.0844,
732
+ "step": 101
733
+ },
734
+ {
735
+ "epoch": 0.06660137120470128,
736
+ "grad_norm": 5.99480676651001,
737
+ "learning_rate": 0.0001798677024564473,
738
+ "loss": 1.8015,
739
+ "step": 102
740
+ },
741
+ {
742
+ "epoch": 0.06725432582435521,
743
+ "grad_norm": 3.063887357711792,
744
+ "learning_rate": 0.00017911433720027624,
745
+ "loss": 1.7182,
746
+ "step": 103
747
+ },
748
+ {
749
+ "epoch": 0.06790728044400914,
750
+ "grad_norm": 3.2303333282470703,
751
+ "learning_rate": 0.00017834877339112612,
752
+ "loss": 1.6701,
753
+ "step": 104
754
+ },
755
+ {
756
+ "epoch": 0.06856023506366307,
757
+ "grad_norm": 7.370791435241699,
758
+ "learning_rate": 0.000177571129070442,
759
+ "loss": 1.6819,
760
+ "step": 105
761
+ },
762
+ {
763
+ "epoch": 0.06921318968331701,
764
+ "grad_norm": 3.4059948921203613,
765
+ "learning_rate": 0.00017678152414234968,
766
+ "loss": 1.3683,
767
+ "step": 106
768
+ },
769
+ {
770
+ "epoch": 0.06986614430297095,
771
+ "grad_norm": 4.130568504333496,
772
+ "learning_rate": 0.000175980080355168,
773
+ "loss": 1.6074,
774
+ "step": 107
775
+ },
776
+ {
777
+ "epoch": 0.07051909892262488,
778
+ "grad_norm": 4.288647651672363,
779
+ "learning_rate": 0.00017516692128263648,
780
+ "loss": 1.2521,
781
+ "step": 108
782
+ },
783
+ {
784
+ "epoch": 0.07117205354227882,
785
+ "grad_norm": 3.245211362838745,
786
+ "learning_rate": 0.00017434217230486164,
787
+ "loss": 1.3333,
788
+ "step": 109
789
+ },
790
+ {
791
+ "epoch": 0.07182500816193274,
792
+ "grad_norm": 3.7068018913269043,
793
+ "learning_rate": 0.00017350596058898483,
794
+ "loss": 1.287,
795
+ "step": 110
796
+ },
797
+ {
798
+ "epoch": 0.07247796278158668,
799
+ "grad_norm": 3.418928623199463,
800
+ "learning_rate": 0.0001726584150695744,
801
+ "loss": 1.3896,
802
+ "step": 111
803
+ },
804
+ {
805
+ "epoch": 0.07313091740124061,
806
+ "grad_norm": 3.3947291374206543,
807
+ "learning_rate": 0.00017179966642874543,
808
+ "loss": 1.7948,
809
+ "step": 112
810
+ },
811
+ {
812
+ "epoch": 0.07378387202089455,
813
+ "grad_norm": 2.770167589187622,
814
+ "learning_rate": 0.0001709298470760101,
815
+ "loss": 1.8008,
816
+ "step": 113
817
+ },
818
+ {
819
+ "epoch": 0.07443682664054849,
820
+ "grad_norm": 2.841723918914795,
821
+ "learning_rate": 0.00017004909112786144,
822
+ "loss": 1.7773,
823
+ "step": 114
824
+ },
825
+ {
826
+ "epoch": 0.07508978126020241,
827
+ "grad_norm": 3.010446071624756,
828
+ "learning_rate": 0.00016915753438709417,
829
+ "loss": 1.9485,
830
+ "step": 115
831
+ },
832
+ {
833
+ "epoch": 0.07574273587985635,
834
+ "grad_norm": 2.804893970489502,
835
+ "learning_rate": 0.00016825531432186543,
836
+ "loss": 1.8283,
837
+ "step": 116
838
+ },
839
+ {
840
+ "epoch": 0.07639569049951028,
841
+ "grad_norm": 2.607825517654419,
842
+ "learning_rate": 0.00016734257004449862,
843
+ "loss": 1.5881,
844
+ "step": 117
845
+ },
846
+ {
847
+ "epoch": 0.07704864511916422,
848
+ "grad_norm": 3.1926357746124268,
849
+ "learning_rate": 0.00016641944229003395,
850
+ "loss": 1.9909,
851
+ "step": 118
852
+ },
853
+ {
854
+ "epoch": 0.07770159973881816,
855
+ "grad_norm": 2.6762239933013916,
856
+ "learning_rate": 0.00016548607339452853,
857
+ "loss": 1.7493,
858
+ "step": 119
859
+ },
860
+ {
861
+ "epoch": 0.07835455435847209,
862
+ "grad_norm": 2.9282002449035645,
863
+ "learning_rate": 0.00016454260727310978,
864
+ "loss": 1.6987,
865
+ "step": 120
866
+ },
867
+ {
868
+ "epoch": 0.07900750897812601,
869
+ "grad_norm": 3.6605136394500732,
870
+ "learning_rate": 0.00016358918939778536,
871
+ "loss": 2.1916,
872
+ "step": 121
873
+ },
874
+ {
875
+ "epoch": 0.07966046359777995,
876
+ "grad_norm": 3.031012535095215,
877
+ "learning_rate": 0.00016262596677501297,
878
+ "loss": 1.9056,
879
+ "step": 122
880
+ },
881
+ {
882
+ "epoch": 0.08031341821743389,
883
+ "grad_norm": 3.2578601837158203,
884
+ "learning_rate": 0.0001616530879230335,
885
+ "loss": 1.5707,
886
+ "step": 123
887
+ },
888
+ {
889
+ "epoch": 0.08096637283708782,
890
+ "grad_norm": 3.2448766231536865,
891
+ "learning_rate": 0.00016067070284897137,
892
+ "loss": 1.43,
893
+ "step": 124
894
+ },
895
+ {
896
+ "epoch": 0.08161932745674176,
897
+ "grad_norm": 3.2631771564483643,
898
+ "learning_rate": 0.00015967896302570485,
899
+ "loss": 1.0851,
900
+ "step": 125
901
+ },
902
+ {
903
+ "epoch": 0.08227228207639568,
904
+ "grad_norm": 3.316664457321167,
905
+ "learning_rate": 0.0001586780213685108,
906
+ "loss": 1.9519,
907
+ "step": 126
908
+ },
909
+ {
910
+ "epoch": 0.08292523669604962,
911
+ "grad_norm": 2.7955403327941895,
912
+ "learning_rate": 0.00015766803221148673,
913
+ "loss": 1.6003,
914
+ "step": 127
915
+ },
916
+ {
917
+ "epoch": 0.08357819131570356,
918
+ "grad_norm": 3.2884178161621094,
919
+ "learning_rate": 0.0001566491512837543,
920
+ "loss": 1.7108,
921
+ "step": 128
922
+ },
923
+ {
924
+ "epoch": 0.0842311459353575,
925
+ "grad_norm": 3.472278356552124,
926
+ "learning_rate": 0.00015562153568544752,
927
+ "loss": 1.8399,
928
+ "step": 129
929
+ },
930
+ {
931
+ "epoch": 0.08488410055501143,
932
+ "grad_norm": 2.900644302368164,
933
+ "learning_rate": 0.00015458534386348966,
934
+ "loss": 1.6259,
935
+ "step": 130
936
+ },
937
+ {
938
+ "epoch": 0.08553705517466537,
939
+ "grad_norm": 3.018883228302002,
940
+ "learning_rate": 0.0001535407355871626,
941
+ "loss": 1.5246,
942
+ "step": 131
943
+ },
944
+ {
945
+ "epoch": 0.08619000979431929,
946
+ "grad_norm": 3.375364303588867,
947
+ "learning_rate": 0.00015248787192347196,
948
+ "loss": 1.6209,
949
+ "step": 132
950
+ },
951
+ {
952
+ "epoch": 0.08684296441397323,
953
+ "grad_norm": 2.9012420177459717,
954
+ "learning_rate": 0.00015142691521231267,
955
+ "loss": 1.2602,
956
+ "step": 133
957
+ },
958
+ {
959
+ "epoch": 0.08749591903362716,
960
+ "grad_norm": 4.421230316162109,
961
+ "learning_rate": 0.00015035802904143762,
962
+ "loss": 1.5445,
963
+ "step": 134
964
+ },
965
+ {
966
+ "epoch": 0.0881488736532811,
967
+ "grad_norm": 3.3842790126800537,
968
+ "learning_rate": 0.00014928137822123452,
969
+ "loss": 1.2163,
970
+ "step": 135
971
+ },
972
+ {
973
+ "epoch": 0.08880182827293504,
974
+ "grad_norm": 3.1379024982452393,
975
+ "learning_rate": 0.0001481971287593138,
976
+ "loss": 1.6229,
977
+ "step": 136
978
+ },
979
+ {
980
+ "epoch": 0.08945478289258897,
981
+ "grad_norm": 3.0401148796081543,
982
+ "learning_rate": 0.00014710544783491208,
983
+ "loss": 2.1325,
984
+ "step": 137
985
+ },
986
+ {
987
+ "epoch": 0.0901077375122429,
988
+ "grad_norm": 3.1100378036499023,
989
+ "learning_rate": 0.00014600650377311522,
990
+ "loss": 1.8436,
991
+ "step": 138
992
+ },
993
+ {
994
+ "epoch": 0.09076069213189683,
995
+ "grad_norm": 2.646742105484009,
996
+ "learning_rate": 0.00014490046601890405,
997
+ "loss": 2.0225,
998
+ "step": 139
999
+ },
1000
+ {
1001
+ "epoch": 0.09141364675155077,
1002
+ "grad_norm": 2.7213666439056396,
1003
+ "learning_rate": 0.00014378750511102826,
1004
+ "loss": 1.9278,
1005
+ "step": 140
1006
+ },
1007
+ {
1008
+ "epoch": 0.0920666013712047,
1009
+ "grad_norm": 2.6186165809631348,
1010
+ "learning_rate": 0.00014266779265571087,
1011
+ "loss": 2.1003,
1012
+ "step": 141
1013
+ },
1014
+ {
1015
+ "epoch": 0.09271955599085864,
1016
+ "grad_norm": 2.469266653060913,
1017
+ "learning_rate": 0.00014154150130018866,
1018
+ "loss": 2.1259,
1019
+ "step": 142
1020
+ },
1021
+ {
1022
+ "epoch": 0.09337251061051256,
1023
+ "grad_norm": 2.95566725730896,
1024
+ "learning_rate": 0.00014040880470609187,
1025
+ "loss": 2.0982,
1026
+ "step": 143
1027
+ },
1028
+ {
1029
+ "epoch": 0.0940254652301665,
1030
+ "grad_norm": 3.408046007156372,
1031
+ "learning_rate": 0.00013926987752266735,
1032
+ "loss": 2.1553,
1033
+ "step": 144
1034
+ },
1035
+ {
1036
+ "epoch": 0.09467841984982044,
1037
+ "grad_norm": 2.5797595977783203,
1038
+ "learning_rate": 0.00013812489535984981,
1039
+ "loss": 1.9652,
1040
+ "step": 145
1041
+ },
1042
+ {
1043
+ "epoch": 0.09533137446947437,
1044
+ "grad_norm": 3.9302403926849365,
1045
+ "learning_rate": 0.00013697403476118454,
1046
+ "loss": 1.916,
1047
+ "step": 146
1048
+ },
1049
+ {
1050
+ "epoch": 0.09598432908912831,
1051
+ "grad_norm": 3.0103251934051514,
1052
+ "learning_rate": 0.0001358174731766064,
1053
+ "loss": 1.5778,
1054
+ "step": 147
1055
+ },
1056
+ {
1057
+ "epoch": 0.09663728370878225,
1058
+ "grad_norm": 3.0252418518066406,
1059
+ "learning_rate": 0.00013465538893507907,
1060
+ "loss": 1.862,
1061
+ "step": 148
1062
+ },
1063
+ {
1064
+ "epoch": 0.09729023832843617,
1065
+ "grad_norm": 3.1504366397857666,
1066
+ "learning_rate": 0.00013348796121709862,
1067
+ "loss": 1.7159,
1068
+ "step": 149
1069
+ },
1070
+ {
1071
+ "epoch": 0.0979431929480901,
1072
+ "grad_norm": 2.718940019607544,
1073
+ "learning_rate": 0.00013231537002706594,
1074
+ "loss": 1.1477,
1075
+ "step": 150
1076
+ },
1077
+ {
1078
+ "epoch": 0.09859614756774404,
1079
+ "grad_norm": 3.6509456634521484,
1080
+ "learning_rate": 0.0001311377961655319,
1081
+ "loss": 1.6706,
1082
+ "step": 151
1083
+ },
1084
+ {
1085
+ "epoch": 0.09924910218739798,
1086
+ "grad_norm": 3.2815134525299072,
1087
+ "learning_rate": 0.00012995542120132017,
1088
+ "loss": 1.4804,
1089
+ "step": 152
1090
+ },
1091
+ {
1092
+ "epoch": 0.09924910218739798,
1093
+ "eval_loss": NaN,
1094
+ "eval_runtime": 699.3675,
1095
+ "eval_samples_per_second": 3.689,
1096
+ "eval_steps_per_second": 0.922,
1097
+ "step": 152
1098
  }
1099
  ],
1100
  "logging_steps": 1,
 
1114
  "attributes": {}
1115
  }
1116
  },
1117
+ "total_flos": 4.662717514168402e+17,
1118
  "train_batch_size": 2,
1119
  "trial_name": null,
1120
  "trial_params": null