ErrorAI commited on
Commit
f7b79d5
·
verified ·
1 Parent(s): a952afa

Training in progress, step 106, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5efad5f8f7e73808adb4355755f63cdb4ca63c891afb97a37a0046cde9f2bfa9
3
  size 36981072
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d2592144216914132e86bd5a601f8e758f85749ba90940e5be447bc46137bc2
3
  size 36981072
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ffca021fedf1be20659dc77c0fac5bd056cccbd6a74f7b6df8cb0e3dcef3c8e5
3
  size 19859140
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cb4b95fed23122522581814dfc54cd9ba36a4a88b61c364a46622c0ffed4d78
3
  size 19859140
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7790bb7587e7ee8050375877046f8390ced5c325d75022f539b338b155a51255
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb7ee2ca041eb973281be8bb6e98884779e97cdbdae58cb160ddd1e05697d7a6
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc189d7074d9d9ea46bbfec5663cb05d9af3d44275e88f14253210fcc01de2f9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c620d55c80c663393c9d19b56755001f0197cc6593e37fbd3941c05d6429b0d4
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.2535885167464115,
5
  "eval_steps": 500,
6
- "global_step": 53,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -378,6 +378,377 @@
378
  "learning_rate": 8.695044586103296e-05,
379
  "loss": 1.5142,
380
  "step": 53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  }
382
  ],
383
  "logging_steps": 1,
@@ -397,7 +768,7 @@
397
  "attributes": {}
398
  }
399
  },
400
- "total_flos": 7102131861454848.0,
401
  "train_batch_size": 4,
402
  "trial_name": null,
403
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.507177033492823,
5
  "eval_steps": 500,
6
+ "global_step": 106,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
378
  "learning_rate": 8.695044586103296e-05,
379
  "loss": 1.5142,
380
  "step": 53
381
+ },
382
+ {
383
+ "epoch": 0.2583732057416268,
384
+ "grad_norm": 0.40443918108940125,
385
+ "learning_rate": 8.642734045558952e-05,
386
+ "loss": 1.5673,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 0.2631578947368421,
391
+ "grad_norm": 0.39546626806259155,
392
+ "learning_rate": 8.58955961532221e-05,
393
+ "loss": 1.6391,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 0.2679425837320574,
398
+ "grad_norm": 0.3606320321559906,
399
+ "learning_rate": 8.535533905932738e-05,
400
+ "loss": 1.605,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 0.2727272727272727,
405
+ "grad_norm": 0.35251346230506897,
406
+ "learning_rate": 8.480669729814635e-05,
407
+ "loss": 1.3353,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 0.27751196172248804,
412
+ "grad_norm": 0.3432234227657318,
413
+ "learning_rate": 8.424980098237903e-05,
414
+ "loss": 1.59,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 0.2822966507177033,
419
+ "grad_norm": 0.3411220908164978,
420
+ "learning_rate": 8.368478218232787e-05,
421
+ "loss": 1.5685,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 0.28708133971291866,
426
+ "grad_norm": 0.2974628210067749,
427
+ "learning_rate": 8.311177489457652e-05,
428
+ "loss": 1.6432,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 0.291866028708134,
433
+ "grad_norm": 0.32686975598335266,
434
+ "learning_rate": 8.25309150102121e-05,
435
+ "loss": 1.6631,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 0.2966507177033493,
440
+ "grad_norm": 0.28340205550193787,
441
+ "learning_rate": 8.194234028259806e-05,
442
+ "loss": 1.4545,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 0.3014354066985646,
447
+ "grad_norm": 0.2994033098220825,
448
+ "learning_rate": 8.134619029470534e-05,
449
+ "loss": 1.5692,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 0.3062200956937799,
454
+ "grad_norm": 0.31368395686149597,
455
+ "learning_rate": 8.074260642600964e-05,
456
+ "loss": 1.4963,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 0.31100478468899523,
461
+ "grad_norm": 0.298289030790329,
462
+ "learning_rate": 8.013173181896283e-05,
463
+ "loss": 1.499,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 0.3157894736842105,
468
+ "grad_norm": 0.30118173360824585,
469
+ "learning_rate": 7.951371134504599e-05,
470
+ "loss": 1.3931,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 0.32057416267942584,
475
+ "grad_norm": 0.3297058045864105,
476
+ "learning_rate": 7.888869157041257e-05,
477
+ "loss": 1.6554,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 0.3253588516746411,
482
+ "grad_norm": 0.3186734616756439,
483
+ "learning_rate": 7.82568207211296e-05,
484
+ "loss": 1.4705,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 0.33014354066985646,
489
+ "grad_norm": 0.3129718601703644,
490
+ "learning_rate": 7.76182486480253e-05,
491
+ "loss": 1.4966,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 0.3349282296650718,
496
+ "grad_norm": 0.31505700945854187,
497
+ "learning_rate": 7.697312679115125e-05,
498
+ "loss": 1.5135,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 0.3397129186602871,
503
+ "grad_norm": 0.3435811996459961,
504
+ "learning_rate": 7.63216081438678e-05,
505
+ "loss": 1.4485,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 0.3444976076555024,
510
+ "grad_norm": 0.3319874703884125,
511
+ "learning_rate": 7.566384721656104e-05,
512
+ "loss": 1.5681,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 0.3492822966507177,
517
+ "grad_norm": 0.3534000813961029,
518
+ "learning_rate": 7.500000000000001e-05,
519
+ "loss": 1.5403,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 0.35406698564593303,
524
+ "grad_norm": 0.33617714047431946,
525
+ "learning_rate": 7.433022392834282e-05,
526
+ "loss": 1.4595,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 0.3588516746411483,
531
+ "grad_norm": 0.3658270537853241,
532
+ "learning_rate": 7.365467784180051e-05,
533
+ "loss": 1.6116,
534
+ "step": 75
535
+ },
536
+ {
537
+ "epoch": 0.36363636363636365,
538
+ "grad_norm": 0.3672615885734558,
539
+ "learning_rate": 7.297352194896739e-05,
540
+ "loss": 1.5343,
541
+ "step": 76
542
+ },
543
+ {
544
+ "epoch": 0.3684210526315789,
545
+ "grad_norm": 0.4313615560531616,
546
+ "learning_rate": 7.228691778882693e-05,
547
+ "loss": 1.6585,
548
+ "step": 77
549
+ },
550
+ {
551
+ "epoch": 0.37320574162679426,
552
+ "grad_norm": 0.3559150695800781,
553
+ "learning_rate": 7.159502819244206e-05,
554
+ "loss": 1.5485,
555
+ "step": 78
556
+ },
557
+ {
558
+ "epoch": 0.37799043062200954,
559
+ "grad_norm": 0.44446811079978943,
560
+ "learning_rate": 7.089801724433917e-05,
561
+ "loss": 1.8183,
562
+ "step": 79
563
+ },
564
+ {
565
+ "epoch": 0.3827751196172249,
566
+ "grad_norm": 0.4031125009059906,
567
+ "learning_rate": 7.019605024359474e-05,
568
+ "loss": 1.7408,
569
+ "step": 80
570
+ },
571
+ {
572
+ "epoch": 0.3875598086124402,
573
+ "grad_norm": 0.4458540678024292,
574
+ "learning_rate": 6.948929366463396e-05,
575
+ "loss": 1.8305,
576
+ "step": 81
577
+ },
578
+ {
579
+ "epoch": 0.3923444976076555,
580
+ "grad_norm": 0.47091686725616455,
581
+ "learning_rate": 6.877791511775063e-05,
582
+ "loss": 1.6138,
583
+ "step": 82
584
+ },
585
+ {
586
+ "epoch": 0.39712918660287083,
587
+ "grad_norm": 0.46907365322113037,
588
+ "learning_rate": 6.806208330935766e-05,
589
+ "loss": 1.7399,
590
+ "step": 83
591
+ },
592
+ {
593
+ "epoch": 0.4019138755980861,
594
+ "grad_norm": 0.5174430012702942,
595
+ "learning_rate": 6.734196800197762e-05,
596
+ "loss": 1.8187,
597
+ "step": 84
598
+ },
599
+ {
600
+ "epoch": 0.40669856459330145,
601
+ "grad_norm": 0.5630610585212708,
602
+ "learning_rate": 6.661773997398298e-05,
603
+ "loss": 2.0938,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 0.41148325358851673,
608
+ "grad_norm": 0.49577152729034424,
609
+ "learning_rate": 6.588957097909508e-05,
610
+ "loss": 1.8769,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 0.41626794258373206,
615
+ "grad_norm": 0.5209452509880066,
616
+ "learning_rate": 6.515763370565218e-05,
617
+ "loss": 1.8037,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 0.42105263157894735,
622
+ "grad_norm": 0.5780702829360962,
623
+ "learning_rate": 6.442210173565561e-05,
624
+ "loss": 2.012,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 0.4258373205741627,
629
+ "grad_norm": 0.5638837814331055,
630
+ "learning_rate": 6.368314950360415e-05,
631
+ "loss": 1.9684,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 0.430622009569378,
636
+ "grad_norm": 0.6348680257797241,
637
+ "learning_rate": 6.294095225512603e-05,
638
+ "loss": 1.7803,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 0.4354066985645933,
643
+ "grad_norm": 0.6502403616905212,
644
+ "learning_rate": 6.219568600541886e-05,
645
+ "loss": 1.8694,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 0.44019138755980863,
650
+ "grad_norm": 0.7062990665435791,
651
+ "learning_rate": 6.14475274975067e-05,
652
+ "loss": 2.005,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 0.4449760765550239,
657
+ "grad_norm": 0.7276941537857056,
658
+ "learning_rate": 6.069665416032487e-05,
659
+ "loss": 2.0282,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 0.44976076555023925,
664
+ "grad_norm": 0.8595307469367981,
665
+ "learning_rate": 5.9943244066641834e-05,
666
+ "loss": 2.1893,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 0.45454545454545453,
671
+ "grad_norm": 0.9870045781135559,
672
+ "learning_rate": 5.918747589082853e-05,
673
+ "loss": 2.1732,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 0.45933014354066987,
678
+ "grad_norm": 1.0758846998214722,
679
+ "learning_rate": 5.842952886648496e-05,
680
+ "loss": 2.1482,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 0.46411483253588515,
685
+ "grad_norm": 1.308895468711853,
686
+ "learning_rate": 5.7669582743934284e-05,
687
+ "loss": 2.0192,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 0.4688995215311005,
692
+ "grad_norm": 1.2809703350067139,
693
+ "learning_rate": 5.6907817747594116e-05,
694
+ "loss": 1.5592,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 0.47368421052631576,
699
+ "grad_norm": 2.426992893218994,
700
+ "learning_rate": 5.614441453323571e-05,
701
+ "loss": 2.0581,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 0.4784688995215311,
706
+ "grad_norm": 4.334367752075195,
707
+ "learning_rate": 5.5379554145140574e-05,
708
+ "loss": 2.1415,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 0.48325358851674644,
713
+ "grad_norm": 0.22294622659683228,
714
+ "learning_rate": 5.4613417973165106e-05,
715
+ "loss": 1.477,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 0.4880382775119617,
720
+ "grad_norm": 0.23113781213760376,
721
+ "learning_rate": 5.38461877097232e-05,
722
+ "loss": 1.3895,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 0.49282296650717705,
727
+ "grad_norm": 0.2687288522720337,
728
+ "learning_rate": 5.307804530669716e-05,
729
+ "loss": 1.5935,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 0.49760765550239233,
734
+ "grad_norm": 0.3015460968017578,
735
+ "learning_rate": 5.230917293228699e-05,
736
+ "loss": 1.7237,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 0.5023923444976076,
741
+ "grad_norm": 0.31039535999298096,
742
+ "learning_rate": 5.153975292780853e-05,
743
+ "loss": 1.4133,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 0.507177033492823,
748
+ "grad_norm": 0.2957954406738281,
749
+ "learning_rate": 5.0769967764450345e-05,
750
+ "loss": 1.4559,
751
+ "step": 106
752
  }
753
  ],
754
  "logging_steps": 1,
 
768
  "attributes": {}
769
  }
770
  },
771
+ "total_flos": 1.4139404071206912e+16,
772
  "train_batch_size": 4,
773
  "trial_name": null,
774
  "trial_params": null