Sara Price committed (verified)
Commit 432db53
Parent(s): ee885da

Training in progress, step 2800, checkpoint

last-checkpoint/model-00001-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cd1af5785dab727dd568f182a5a7d20a4d19d85f9c5061fcb4eee97e78e41f19
+oid sha256:2b3effd21deff73b875cbbbcb7b68b90bbfe8903a0e51be8058c53758b4b69e8
 size 4840658560
last-checkpoint/model-00002-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:55b4db8857b8d16de1d15b0ad8fda306ab20929d9845f18cb5e94ed506ca617e
+oid sha256:e75145539a74937cbcb14393098e0dc3e8f08f26d13afd817a896a7eb3079013
 size 4857206856
last-checkpoint/model-00003-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:527d7d91d8b4744c114e7ed7f2de910a06b292399133d52b7a6df3c0d43419f3
+oid sha256:6930d75a718de0acda4d53d225f373f929a57fdfb8bfe746b26f86f2e57e68f7
 size 4857206904
last-checkpoint/model-00004-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:18fa49b80bdbd4d6cf5d1689c1b1b32d416a6c02263e93e4035781720562e8d3
+oid sha256:ba3cac452013d3ad33956d1041393fc3156668df0db3bcea8982e4f2071f1d33
 size 4857206904
last-checkpoint/model-00005-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:36a75564599468267099c3aae3afef58090fbb57fbbbda643c82dc57f5da6286
+oid sha256:f2c2b2d4b0276c78e66a79fde63bf7cbecab3654e3af8cfa4967412e8bdb0839
 size 4857206904
last-checkpoint/model-00006-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c744ac3b48f54b66e6e755b9024915c114b92851010d1882a3cf043cb8f0896c
+oid sha256:d0629ef624962c09361086545ce0d1e1cbce3b48499e371e8107304328d85e5d
 size 2684734256
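
Each shard above is tracked through Git LFS, so the repository stores only the small pointer file shown in the diffs (the version, oid sha256 and size lines); this commit swaps each pointer to a new object hash while the shard sizes stay unchanged. Below is a minimal sketch of checking a downloaded shard against its pointer. The local path and helper name are assumptions for illustration; only the oid and size values come from this commit.

import hashlib
import os

def matches_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
    """Return True when the local file's byte size and SHA-256 digest match the LFS pointer."""
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# Values taken from the model-00001-of-00006.safetensors pointer in this commit.
print(matches_lfs_pointer(
    "last-checkpoint/model-00001-of-00006.safetensors",
    "2b3effd21deff73b875cbbbcb7b68b90bbfe8903a0e51be8058c53758b4b69e8",
    4840658560,
))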
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 3.2,
+  "epoch": 5.6,
   "eval_steps": 50,
-  "global_step": 1600,
+  "global_step": 2800,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -487,6 +487,366 @@
       "eval_samples_per_second": 69.062,
       "eval_steps_per_second": 3.542,
       "step": 1600
+    },
+    {
+      "epoch": 3.3,
+      "grad_norm": 1.1109460592269897,
+      "learning_rate": 1.2708404681430054e-05,
+      "loss": 0.1496,
+      "step": 1650
+    },
+    {
+      "epoch": 3.3,
+      "eval_loss": 0.8655369281768799,
+      "eval_runtime": 4.4935,
+      "eval_samples_per_second": 34.717,
+      "eval_steps_per_second": 1.78,
+      "step": 1650
+    },
+    {
+      "epoch": 3.4,
+      "grad_norm": 1.0697747468948364,
+      "learning_rate": 1.2225209339563144e-05,
+      "loss": 0.1491,
+      "step": 1700
+    },
+    {
+      "epoch": 3.4,
+      "eval_loss": 0.8790720701217651,
+      "eval_runtime": 2.9231,
+      "eval_samples_per_second": 53.369,
+      "eval_steps_per_second": 2.737,
+      "step": 1700
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 1.5176475048065186,
+      "learning_rate": 1.1736481776669307e-05,
+      "loss": 0.1475,
+      "step": 1750
+    },
+    {
+      "epoch": 3.5,
+      "eval_loss": 0.8695181608200073,
+      "eval_runtime": 2.8158,
+      "eval_samples_per_second": 55.401,
+      "eval_steps_per_second": 2.841,
+      "step": 1750
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 2.0232934951782227,
+      "learning_rate": 1.1243437046474854e-05,
+      "loss": 0.1481,
+      "step": 1800
+    },
+    {
+      "epoch": 3.6,
+      "eval_loss": 0.8817379474639893,
+      "eval_runtime": 3.4544,
+      "eval_samples_per_second": 45.159,
+      "eval_steps_per_second": 2.316,
+      "step": 1800
+    },
+    {
+      "epoch": 3.7,
+      "grad_norm": 2.0155882835388184,
+      "learning_rate": 1.0747300935864245e-05,
+      "loss": 0.1514,
+      "step": 1850
+    },
+    {
+      "epoch": 3.7,
+      "eval_loss": 0.8860240578651428,
+      "eval_runtime": 2.2629,
+      "eval_samples_per_second": 68.939,
+      "eval_steps_per_second": 3.535,
+      "step": 1850
+    },
+    {
+      "epoch": 3.8,
+      "grad_norm": 1.2712018489837646,
+      "learning_rate": 1.0249306917380731e-05,
+      "loss": 0.1421,
+      "step": 1900
+    },
+    {
+      "epoch": 3.8,
+      "eval_loss": 0.8792973756790161,
+      "eval_runtime": 2.2679,
+      "eval_samples_per_second": 68.786,
+      "eval_steps_per_second": 3.528,
+      "step": 1900
+    },
+    {
+      "epoch": 3.9,
+      "grad_norm": 1.3542224168777466,
+      "learning_rate": 9.750693082619274e-06,
+      "loss": 0.1446,
+      "step": 1950
+    },
+    {
+      "epoch": 3.9,
+      "eval_loss": 0.8789901733398438,
+      "eval_runtime": 2.2568,
+      "eval_samples_per_second": 69.126,
+      "eval_steps_per_second": 3.545,
+      "step": 1950
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 1.1753352880477905,
+      "learning_rate": 9.252699064135759e-06,
+      "loss": 0.1458,
+      "step": 2000
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.8896375894546509,
+      "eval_runtime": 2.2586,
+      "eval_samples_per_second": 69.069,
+      "eval_steps_per_second": 3.542,
+      "step": 2000
+    },
+    {
+      "epoch": 4.1,
+      "grad_norm": 1.1664059162139893,
+      "learning_rate": 8.756562953525151e-06,
+      "loss": 0.0943,
+      "step": 2050
+    },
+    {
+      "epoch": 4.1,
+      "eval_loss": 0.9453464150428772,
+      "eval_runtime": 2.7602,
+      "eval_samples_per_second": 56.517,
+      "eval_steps_per_second": 2.898,
+      "step": 2050
+    },
+    {
+      "epoch": 4.2,
+      "grad_norm": 0.8652101755142212,
+      "learning_rate": 8.263518223330698e-06,
+      "loss": 0.0936,
+      "step": 2100
+    },
+    {
+      "epoch": 4.2,
+      "eval_loss": 0.9263865947723389,
+      "eval_runtime": 2.7019,
+      "eval_samples_per_second": 57.738,
+      "eval_steps_per_second": 2.961,
+      "step": 2100
+    },
+    {
+      "epoch": 4.3,
+      "grad_norm": 1.1194976568222046,
+      "learning_rate": 7.774790660436857e-06,
+      "loss": 0.0944,
+      "step": 2150
+    },
+    {
+      "epoch": 4.3,
+      "eval_loss": 0.9188054203987122,
+      "eval_runtime": 2.4797,
+      "eval_samples_per_second": 62.91,
+      "eval_steps_per_second": 3.226,
+      "step": 2150
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 0.8499676585197449,
+      "learning_rate": 7.291595318569951e-06,
+      "loss": 0.0946,
+      "step": 2200
+    },
+    {
+      "epoch": 4.4,
+      "eval_loss": 0.9282132387161255,
+      "eval_runtime": 4.1021,
+      "eval_samples_per_second": 38.03,
+      "eval_steps_per_second": 1.95,
+      "step": 2200
+    },
+    {
+      "epoch": 4.5,
+      "grad_norm": 0.9869304299354553,
+      "learning_rate": 6.815133497483157e-06,
+      "loss": 0.0914,
+      "step": 2250
+    },
+    {
+      "epoch": 4.5,
+      "eval_loss": 0.9421446323394775,
+      "eval_runtime": 2.2639,
+      "eval_samples_per_second": 68.908,
+      "eval_steps_per_second": 3.534,
+      "step": 2250
+    },
+    {
+      "epoch": 4.6,
+      "grad_norm": 0.7122153043746948,
+      "learning_rate": 6.34658975633605e-06,
+      "loss": 0.0927,
+      "step": 2300
+    },
+    {
+      "epoch": 4.6,
+      "eval_loss": 0.9317699074745178,
+      "eval_runtime": 2.2762,
+      "eval_samples_per_second": 68.535,
+      "eval_steps_per_second": 3.515,
+      "step": 2300
+    },
+    {
+      "epoch": 4.7,
+      "grad_norm": 0.7879806756973267,
+      "learning_rate": 5.887128968693887e-06,
+      "loss": 0.0926,
+      "step": 2350
+    },
+    {
+      "epoch": 4.7,
+      "eval_loss": 0.9379280209541321,
+      "eval_runtime": 2.2587,
+      "eval_samples_per_second": 69.065,
+      "eval_steps_per_second": 3.542,
+      "step": 2350
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 0.7589277625083923,
+      "learning_rate": 5.43789342646837e-06,
+      "loss": 0.096,
+      "step": 2400
+    },
+    {
+      "epoch": 4.8,
+      "eval_loss": 0.9322577118873596,
+      "eval_runtime": 2.2566,
+      "eval_samples_per_second": 69.13,
+      "eval_steps_per_second": 3.545,
+      "step": 2400
+    },
+    {
+      "epoch": 4.9,
+      "grad_norm": 1.010057806968689,
+      "learning_rate": 5.000000000000003e-06,
+      "loss": 0.093,
+      "step": 2450
+    },
+    {
+      "epoch": 4.9,
+      "eval_loss": 0.9507681131362915,
+      "eval_runtime": 2.3695,
+      "eval_samples_per_second": 65.835,
+      "eval_steps_per_second": 3.376,
+      "step": 2450
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.8985171914100647,
+      "learning_rate": 4.5745373613424075e-06,
+      "loss": 0.0923,
+      "step": 2500
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 0.9445481896400452,
+      "eval_runtime": 2.455,
+      "eval_samples_per_second": 63.545,
+      "eval_steps_per_second": 3.259,
+      "step": 2500
+    },
+    {
+      "epoch": 5.1,
+      "grad_norm": 0.7004356384277344,
+      "learning_rate": 4.162563277652104e-06,
+      "loss": 0.0534,
+      "step": 2550
+    },
+    {
+      "epoch": 5.1,
+      "eval_loss": 1.019740343093872,
+      "eval_runtime": 3.8097,
+      "eval_samples_per_second": 40.949,
+      "eval_steps_per_second": 2.1,
+      "step": 2550
+    },
+    {
+      "epoch": 5.2,
+      "grad_norm": 1.2868136167526245,
+      "learning_rate": 3.7651019814126656e-06,
+      "loss": 0.0552,
+      "step": 2600
+    },
+    {
+      "epoch": 5.2,
+      "eval_loss": 1.0226831436157227,
+      "eval_runtime": 3.5746,
+      "eval_samples_per_second": 43.641,
+      "eval_steps_per_second": 2.238,
+      "step": 2600
+    },
+    {
+      "epoch": 5.3,
+      "grad_norm": 1.0688791275024414,
+      "learning_rate": 3.3831416240314085e-06,
+      "loss": 0.0523,
+      "step": 2650
+    },
+    {
+      "epoch": 5.3,
+      "eval_loss": 1.0200960636138916,
+      "eval_runtime": 2.26,
+      "eval_samples_per_second": 69.026,
+      "eval_steps_per_second": 3.54,
+      "step": 2650
+    },
+    {
+      "epoch": 5.4,
+      "grad_norm": 0.46982139348983765,
+      "learning_rate": 3.017631819139273e-06,
+      "loss": 0.0534,
+      "step": 2700
+    },
+    {
+      "epoch": 5.4,
+      "eval_loss": 1.0137168169021606,
+      "eval_runtime": 2.2646,
+      "eval_samples_per_second": 68.885,
+      "eval_steps_per_second": 3.533,
+      "step": 2700
+    },
+    {
+      "epoch": 5.5,
+      "grad_norm": 0.4713106155395508,
+      "learning_rate": 2.669481281701739e-06,
+      "loss": 0.0541,
+      "step": 2750
+    },
+    {
+      "epoch": 5.5,
+      "eval_loss": 1.0238293409347534,
+      "eval_runtime": 2.2586,
+      "eval_samples_per_second": 69.068,
+      "eval_steps_per_second": 3.542,
+      "step": 2750
+    },
+    {
+      "epoch": 5.6,
+      "grad_norm": 0.5023716688156128,
+      "learning_rate": 2.339555568810221e-06,
+      "loss": 0.0518,
+      "step": 2800
+    },
+    {
+      "epoch": 5.6,
+      "eval_loss": 1.0314223766326904,
+      "eval_runtime": 2.2586,
+      "eval_samples_per_second": 69.069,
+      "eval_steps_per_second": 3.542,
+      "step": 2800
     }
   ],
   "logging_steps": 50,
@@ -494,7 +854,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 7,
   "save_steps": 400,
-  "total_flos": 7.727857024158925e+16,
+  "total_flos": 1.3524716052545536e+17,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null