error577 committed on
Commit
87eca35
·
verified ·
1 Parent(s): 9a51863

Training in progress, step 400, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7032c700062a12e7d557b23e74cfadfc91a0420a0ae4a3b2af217f1222d5b973
3
  size 578859568
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:024de3f8e2afb4251670fd08dca4001b5a22b5aa81ae8963df0898d4eaf258bf
3
  size 578859568
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e4dd87342bbdc488b7be2102740593ad560124616d462b5e22f0d4e4dc37a98
3
  size 294324692
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbc5bb928545579b4a25c09020cb0ddd259af2200a0a70f135813471914ab7ec
3
  size 294324692
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:675d87ddb0df920fcc76dfa60cec09c41e3d4f94fc027859559749f5c5664296
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:404ce932a6e24aba5cc2e7fbd9a324e1334ed0f859ca94dba9e1b8e1bea61d54
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cca9c47b2bb879349fb9e7d9ad8f1da04c3b167e659c3d39ce46064118df0eea
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fe5dbedd3d0105f98d40d84fbe544af591501f8969d82c59cef4d7bb5f81712
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.7205991148948669,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-350",
4
- "epoch": 0.009728910848430519,
5
  "eval_steps": 50,
6
- "global_step": 350,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2521,6 +2521,364 @@
2521
  "eval_samples_per_second": 11.514,
2522
  "eval_steps_per_second": 5.767,
2523
  "step": 350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2524
  }
2525
  ],
2526
  "logging_steps": 1,
@@ -2549,7 +2907,7 @@
2549
  "attributes": {}
2550
  }
2551
  },
2552
- "total_flos": 2.248730763853824e+16,
2553
  "train_batch_size": 2,
2554
  "trial_name": null,
2555
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7006093859672546,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-400",
4
+ "epoch": 0.011118755255349163,
5
  "eval_steps": 50,
6
+ "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2521
  "eval_samples_per_second": 11.514,
2522
  "eval_steps_per_second": 5.767,
2523
  "step": 350
2524
+ },
2525
+ {
2526
+ "epoch": 0.00975670773656889,
2527
+ "grad_norm": 0.1047213226556778,
2528
+ "learning_rate": 0.0004226366711111808,
2529
+ "loss": 0.942,
2530
+ "step": 351
2531
+ },
2532
+ {
2533
+ "epoch": 0.009784504624707265,
2534
+ "grad_norm": 0.10103016346693039,
2535
+ "learning_rate": 0.00041741373548960395,
2536
+ "loss": 0.7983,
2537
+ "step": 352
2538
+ },
2539
+ {
2540
+ "epoch": 0.009812301512845636,
2541
+ "grad_norm": 0.10744207352399826,
2542
+ "learning_rate": 0.00041221474770752696,
2543
+ "loss": 1.0167,
2544
+ "step": 353
2545
+ },
2546
+ {
2547
+ "epoch": 0.00984009840098401,
2548
+ "grad_norm": 0.10127032548189163,
2549
+ "learning_rate": 0.0004070399214749743,
2550
+ "loss": 0.9075,
2551
+ "step": 354
2552
+ },
2553
+ {
2554
+ "epoch": 0.009867895289122382,
2555
+ "grad_norm": 0.10049542039632797,
2556
+ "learning_rate": 0.000401889469508784,
2557
+ "loss": 0.9626,
2558
+ "step": 355
2559
+ },
2560
+ {
2561
+ "epoch": 0.009895692177260756,
2562
+ "grad_norm": 0.08796097338199615,
2563
+ "learning_rate": 0.00039676360352386354,
2564
+ "loss": 0.7321,
2565
+ "step": 356
2566
+ },
2567
+ {
2568
+ "epoch": 0.009923489065399128,
2569
+ "grad_norm": 0.07982050627470016,
2570
+ "learning_rate": 0.00039166253422448684,
2571
+ "loss": 0.6331,
2572
+ "step": 357
2573
+ },
2574
+ {
2575
+ "epoch": 0.009951285953537501,
2576
+ "grad_norm": 0.08986510336399078,
2577
+ "learning_rate": 0.0003865864712956336,
2578
+ "loss": 0.646,
2579
+ "step": 358
2580
+ },
2581
+ {
2582
+ "epoch": 0.009979082841675875,
2583
+ "grad_norm": 0.0990855023264885,
2584
+ "learning_rate": 0.00038153562339436853,
2585
+ "loss": 0.7897,
2586
+ "step": 359
2587
+ },
2588
+ {
2589
+ "epoch": 0.010006879729814247,
2590
+ "grad_norm": 0.09031044691801071,
2591
+ "learning_rate": 0.0003765101981412665,
2592
+ "loss": 0.7884,
2593
+ "step": 360
2594
+ },
2595
+ {
2596
+ "epoch": 0.01003467661795262,
2597
+ "grad_norm": 0.08221515268087387,
2598
+ "learning_rate": 0.0003715104021118764,
2599
+ "loss": 0.787,
2600
+ "step": 361
2601
+ },
2602
+ {
2603
+ "epoch": 0.010062473506090993,
2604
+ "grad_norm": 0.09221215546131134,
2605
+ "learning_rate": 0.00036653644082823046,
2606
+ "loss": 0.7991,
2607
+ "step": 362
2608
+ },
2609
+ {
2610
+ "epoch": 0.010090270394229366,
2611
+ "grad_norm": 0.08920681476593018,
2612
+ "learning_rate": 0.00036158851875039456,
2613
+ "loss": 0.9703,
2614
+ "step": 363
2615
+ },
2616
+ {
2617
+ "epoch": 0.010118067282367738,
2618
+ "grad_norm": 0.09488007426261902,
2619
+ "learning_rate": 0.0003566668392680662,
2620
+ "loss": 0.759,
2621
+ "step": 364
2622
+ },
2623
+ {
2624
+ "epoch": 0.010145864170506112,
2625
+ "grad_norm": 0.0824998989701271,
2626
+ "learning_rate": 0.0003517716046922118,
2627
+ "loss": 0.6603,
2628
+ "step": 365
2629
+ },
2630
+ {
2631
+ "epoch": 0.010173661058644486,
2632
+ "grad_norm": 0.0937090590596199,
2633
+ "learning_rate": 0.00034690301624675125,
2634
+ "loss": 0.7799,
2635
+ "step": 366
2636
+ },
2637
+ {
2638
+ "epoch": 0.010201457946782858,
2639
+ "grad_norm": 0.08511339873075485,
2640
+ "learning_rate": 0.00034206127406028743,
2641
+ "loss": 0.6858,
2642
+ "step": 367
2643
+ },
2644
+ {
2645
+ "epoch": 0.010229254834921231,
2646
+ "grad_norm": 0.09240502119064331,
2647
+ "learning_rate": 0.0003372465771578771,
2648
+ "loss": 0.8179,
2649
+ "step": 368
2650
+ },
2651
+ {
2652
+ "epoch": 0.010257051723059603,
2653
+ "grad_norm": 0.08915253728628159,
2654
+ "learning_rate": 0.000332459123452852,
2655
+ "loss": 0.7399,
2656
+ "step": 369
2657
+ },
2658
+ {
2659
+ "epoch": 0.010284848611197977,
2660
+ "grad_norm": 0.09466725587844849,
2661
+ "learning_rate": 0.00032769910973868313,
2662
+ "loss": 0.7574,
2663
+ "step": 370
2664
+ },
2665
+ {
2666
+ "epoch": 0.010312645499336349,
2667
+ "grad_norm": 0.09638349711894989,
2668
+ "learning_rate": 0.00032296673168089073,
2669
+ "loss": 0.7101,
2670
+ "step": 371
2671
+ },
2672
+ {
2673
+ "epoch": 0.010340442387474722,
2674
+ "grad_norm": 0.09288477897644043,
2675
+ "learning_rate": 0.0003182621838090006,
2676
+ "loss": 0.7876,
2677
+ "step": 372
2678
+ },
2679
+ {
2680
+ "epoch": 0.010368239275613094,
2681
+ "grad_norm": 0.08711199462413788,
2682
+ "learning_rate": 0.0003135856595085498,
2683
+ "loss": 0.548,
2684
+ "step": 373
2685
+ },
2686
+ {
2687
+ "epoch": 0.010396036163751468,
2688
+ "grad_norm": 0.09743466973304749,
2689
+ "learning_rate": 0.00030893735101313535,
2690
+ "loss": 0.7647,
2691
+ "step": 374
2692
+ },
2693
+ {
2694
+ "epoch": 0.010423833051889842,
2695
+ "grad_norm": 0.10079675167798996,
2696
+ "learning_rate": 0.0003043174493965136,
2697
+ "loss": 0.6033,
2698
+ "step": 375
2699
+ },
2700
+ {
2701
+ "epoch": 0.010451629940028214,
2702
+ "grad_norm": 0.11834803968667984,
2703
+ "learning_rate": 0.0002997261445647453,
2704
+ "loss": 0.7904,
2705
+ "step": 376
2706
+ },
2707
+ {
2708
+ "epoch": 0.010479426828166587,
2709
+ "grad_norm": 0.10599831491708755,
2710
+ "learning_rate": 0.00029516362524838847,
2711
+ "loss": 0.6515,
2712
+ "step": 377
2713
+ },
2714
+ {
2715
+ "epoch": 0.01050722371630496,
2716
+ "grad_norm": 0.09899824112653732,
2717
+ "learning_rate": 0.0002906300789947421,
2718
+ "loss": 0.5683,
2719
+ "step": 378
2720
+ },
2721
+ {
2722
+ "epoch": 0.010535020604443333,
2723
+ "grad_norm": 0.09238414466381073,
2724
+ "learning_rate": 0.00028612569216013674,
2725
+ "loss": 0.5779,
2726
+ "step": 379
2727
+ },
2728
+ {
2729
+ "epoch": 0.010562817492581705,
2730
+ "grad_norm": 0.0977087989449501,
2731
+ "learning_rate": 0.0002816506499022725,
2732
+ "loss": 0.562,
2733
+ "step": 380
2734
+ },
2735
+ {
2736
+ "epoch": 0.010590614380720079,
2737
+ "grad_norm": 0.09863676875829697,
2738
+ "learning_rate": 0.00027720513617260855,
2739
+ "loss": 0.6458,
2740
+ "step": 381
2741
+ },
2742
+ {
2743
+ "epoch": 0.01061841126885845,
2744
+ "grad_norm": 0.11672049015760422,
2745
+ "learning_rate": 0.0002727893337088027,
2746
+ "loss": 0.7436,
2747
+ "step": 382
2748
+ },
2749
+ {
2750
+ "epoch": 0.010646208156996824,
2751
+ "grad_norm": 0.10594779998064041,
2752
+ "learning_rate": 0.0002684034240271986,
2753
+ "loss": 0.6286,
2754
+ "step": 383
2755
+ },
2756
+ {
2757
+ "epoch": 0.010674005045135198,
2758
+ "grad_norm": 0.11194431781768799,
2759
+ "learning_rate": 0.00026404758741536505,
2760
+ "loss": 0.6866,
2761
+ "step": 384
2762
+ },
2763
+ {
2764
+ "epoch": 0.01070180193327357,
2765
+ "grad_norm": 0.10711811482906342,
2766
+ "learning_rate": 0.00025972200292468463,
2767
+ "loss": 0.5329,
2768
+ "step": 385
2769
+ },
2770
+ {
2771
+ "epoch": 0.010729598821411944,
2772
+ "grad_norm": 0.112278513610363,
2773
+ "learning_rate": 0.00025542684836299314,
2774
+ "loss": 0.6123,
2775
+ "step": 386
2776
+ },
2777
+ {
2778
+ "epoch": 0.010757395709550316,
2779
+ "grad_norm": 0.12985379993915558,
2780
+ "learning_rate": 0.0002511623002872718,
2781
+ "loss": 0.686,
2782
+ "step": 387
2783
+ },
2784
+ {
2785
+ "epoch": 0.01078519259768869,
2786
+ "grad_norm": 0.12807150185108185,
2787
+ "learning_rate": 0.00024692853399638914,
2788
+ "loss": 0.7307,
2789
+ "step": 388
2790
+ },
2791
+ {
2792
+ "epoch": 0.010812989485827061,
2793
+ "grad_norm": 0.11589968949556351,
2794
+ "learning_rate": 0.00024272572352389488,
2795
+ "loss": 0.7484,
2796
+ "step": 389
2797
+ },
2798
+ {
2799
+ "epoch": 0.010840786373965435,
2800
+ "grad_norm": 0.11197637766599655,
2801
+ "learning_rate": 0.0002385540416308656,
2802
+ "loss": 0.645,
2803
+ "step": 390
2804
+ },
2805
+ {
2806
+ "epoch": 0.010868583262103807,
2807
+ "grad_norm": 0.11974731087684631,
2808
+ "learning_rate": 0.00023441365979880524,
2809
+ "loss": 0.6447,
2810
+ "step": 391
2811
+ },
2812
+ {
2813
+ "epoch": 0.01089638015024218,
2814
+ "grad_norm": 0.11931908875703812,
2815
+ "learning_rate": 0.00023030474822259396,
2816
+ "loss": 0.5268,
2817
+ "step": 392
2818
+ },
2819
+ {
2820
+ "epoch": 0.010924177038380554,
2821
+ "grad_norm": 0.1177850067615509,
2822
+ "learning_rate": 0.0002262274758034931,
2823
+ "loss": 0.5544,
2824
+ "step": 393
2825
+ },
2826
+ {
2827
+ "epoch": 0.010951973926518926,
2828
+ "grad_norm": 0.12234242260456085,
2829
+ "learning_rate": 0.00022218201014220264,
2830
+ "loss": 0.6351,
2831
+ "step": 394
2832
+ },
2833
+ {
2834
+ "epoch": 0.0109797708146573,
2835
+ "grad_norm": 0.12924128770828247,
2836
+ "learning_rate": 0.0002181685175319702,
2837
+ "loss": 0.6468,
2838
+ "step": 395
2839
+ },
2840
+ {
2841
+ "epoch": 0.011007567702795672,
2842
+ "grad_norm": 0.1454528272151947,
2843
+ "learning_rate": 0.00021418716295175765,
2844
+ "loss": 0.6534,
2845
+ "step": 396
2846
+ },
2847
+ {
2848
+ "epoch": 0.011035364590934045,
2849
+ "grad_norm": 0.13180844485759735,
2850
+ "learning_rate": 0.0002102381100594577,
2851
+ "loss": 0.5698,
2852
+ "step": 397
2853
+ },
2854
+ {
2855
+ "epoch": 0.011063161479072417,
2856
+ "grad_norm": 0.1514425277709961,
2857
+ "learning_rate": 0.00020632152118516778,
2858
+ "loss": 0.6062,
2859
+ "step": 398
2860
+ },
2861
+ {
2862
+ "epoch": 0.011090958367210791,
2863
+ "grad_norm": 0.1729186773300171,
2864
+ "learning_rate": 0.00020243755732451564,
2865
+ "loss": 0.6178,
2866
+ "step": 399
2867
+ },
2868
+ {
2869
+ "epoch": 0.011118755255349163,
2870
+ "grad_norm": 0.2056732028722763,
2871
+ "learning_rate": 0.0001985863781320435,
2872
+ "loss": 0.7743,
2873
+ "step": 400
2874
+ },
2875
+ {
2876
+ "epoch": 0.011118755255349163,
2877
+ "eval_loss": 0.7006093859672546,
2878
+ "eval_runtime": 49.996,
2879
+ "eval_samples_per_second": 11.541,
2880
+ "eval_steps_per_second": 5.78,
2881
+ "step": 400
2882
  }
2883
  ],
2884
  "logging_steps": 1,
 
2907
  "attributes": {}
2908
  }
2909
  },
2910
+ "total_flos": 2.5658179794763776e+16,
2911
  "train_batch_size": 2,
2912
  "trial_name": null,
2913
  "trial_params": null