ErrorAI commited on
Commit
8a8da10
·
verified ·
1 Parent(s): 318ded7

Training in progress, step 126, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e887fefb5da70cb425e53443f69458c72b5f5e2949808552cb3ae89d35ede2a8
3
  size 80792096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d62bd1b1aeca5b48840634b417207c9c4c09169919bf06b8c8da0f375271cd0a
3
  size 80792096
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ffecc5ac466200e272ab04bfe33f734237a8968b9a035f8f94d38e9312822aa7
3
  size 41459700
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6b8f03f3f0e8db3b09a38da8377783582d0876593c57b201c0069dccd794d5c
3
  size 41459700
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:543398ed728b58a3784cd7cd460f51a25d556e316e4503cde55f005d8cf00cb7
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89875cf1fe47640e70f9e2ecabacd744805cbcd848b3213e20bfb7af6840fd49
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52d101ecb5410ac05183d1bfbe1110eb020b3ec6b94a7ed0ff839f0433ca3942
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baef09c38255daf871f5fc10bb6a87c467914d1dfa26b8771954f9499745efe8
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5083207261724659,
5
  "eval_steps": 500,
6
- "global_step": 84,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -595,6 +595,300 @@
595
  "learning_rate": 5.146326588692438e-05,
596
  "loss": 1.1875,
597
  "step": 84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598
  }
599
  ],
600
  "logging_steps": 1,
@@ -614,7 +908,7 @@
614
  "attributes": {}
615
  }
616
  },
617
- "total_flos": 5.890109835785011e+16,
618
  "train_batch_size": 4,
619
  "trial_name": null,
620
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.762481089258699,
5
  "eval_steps": 500,
6
+ "global_step": 126,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
595
  "learning_rate": 5.146326588692438e-05,
596
  "loss": 1.1875,
597
  "step": 84
598
+ },
599
+ {
600
+ "epoch": 0.5143721633888049,
601
+ "grad_norm": 9.615294456481934,
602
+ "learning_rate": 5.048781720696291e-05,
603
+ "loss": 1.5413,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 0.5204236006051437,
608
+ "grad_norm": 8.135547637939453,
609
+ "learning_rate": 4.95121827930371e-05,
610
+ "loss": 1.5035,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 0.5264750378214826,
615
+ "grad_norm": 5.731124401092529,
616
+ "learning_rate": 4.853673411307564e-05,
617
+ "loss": 1.2925,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 0.5325264750378215,
622
+ "grad_norm": 5.864335060119629,
623
+ "learning_rate": 4.756184256428992e-05,
624
+ "loss": 0.8643,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 0.5385779122541604,
629
+ "grad_norm": 6.120126247406006,
630
+ "learning_rate": 4.658787933176646e-05,
631
+ "loss": 0.8852,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 0.5446293494704992,
636
+ "grad_norm": 3.3239099979400635,
637
+ "learning_rate": 4.561521524713997e-05,
638
+ "loss": 0.6142,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 0.5506807866868382,
643
+ "grad_norm": 3.291146993637085,
644
+ "learning_rate": 4.4644220647401136e-05,
645
+ "loss": 0.6233,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 0.556732223903177,
650
+ "grad_norm": 2.9767818450927734,
651
+ "learning_rate": 4.367526523389253e-05,
652
+ "loss": 0.4502,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 0.5627836611195158,
657
+ "grad_norm": 2.5687639713287354,
658
+ "learning_rate": 4.2708717931546825e-05,
659
+ "loss": 0.4639,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 0.5688350983358548,
664
+ "grad_norm": 2.8956291675567627,
665
+ "learning_rate": 4.174494674842038e-05,
666
+ "loss": 0.3833,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 0.5748865355521936,
671
+ "grad_norm": 3.41869854927063,
672
+ "learning_rate": 4.0784318635576055e-05,
673
+ "loss": 0.3458,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 0.5809379727685325,
678
+ "grad_norm": 2.012956142425537,
679
+ "learning_rate": 3.982719934736832e-05,
680
+ "loss": 0.2622,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 0.5869894099848714,
685
+ "grad_norm": 1.7718323469161987,
686
+ "learning_rate": 3.887395330218429e-05,
687
+ "loss": 0.1466,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 0.5930408472012103,
692
+ "grad_norm": 1.5787506103515625,
693
+ "learning_rate": 3.792494344369311e-05,
694
+ "loss": 0.1388,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 0.5990922844175491,
699
+ "grad_norm": 1.3233414888381958,
700
+ "learning_rate": 3.698053110265699e-05,
701
+ "loss": 0.117,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 0.6051437216338881,
706
+ "grad_norm": 1.1028014421463013,
707
+ "learning_rate": 3.604107585935638e-05,
708
+ "loss": 0.12,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 0.6111951588502269,
713
+ "grad_norm": 1.057910442352295,
714
+ "learning_rate": 3.510693540668151e-05,
715
+ "loss": 0.0996,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 0.6172465960665658,
720
+ "grad_norm": 0.8907783031463623,
721
+ "learning_rate": 3.4178465413942625e-05,
722
+ "loss": 0.0811,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 0.6232980332829047,
727
+ "grad_norm": 0.7459267377853394,
728
+ "learning_rate": 3.325601939145069e-05,
729
+ "loss": 0.0353,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 0.6293494704992436,
734
+ "grad_norm": 0.9089831113815308,
735
+ "learning_rate": 3.23399485559201e-05,
736
+ "loss": 0.0709,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 0.6354009077155824,
741
+ "grad_norm": 0.7159168124198914,
742
+ "learning_rate": 3.143060169674468e-05,
743
+ "loss": 0.0451,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 0.6414523449319214,
748
+ "grad_norm": 0.5289613008499146,
749
+ "learning_rate": 3.0528325043197785e-05,
750
+ "loss": 0.0278,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 0.6475037821482602,
755
+ "grad_norm": 0.5118010640144348,
756
+ "learning_rate": 2.963346213260737e-05,
757
+ "loss": 0.0221,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 0.653555219364599,
762
+ "grad_norm": 0.354955792427063,
763
+ "learning_rate": 2.874635367955579e-05,
764
+ "loss": 0.0104,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 0.659606656580938,
769
+ "grad_norm": 1.1922907829284668,
770
+ "learning_rate": 2.7867337446154396e-05,
771
+ "loss": 0.0275,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 0.6656580937972768,
776
+ "grad_norm": 0.9284811615943909,
777
+ "learning_rate": 2.6996748113442394e-05,
778
+ "loss": 0.0236,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 0.6717095310136157,
783
+ "grad_norm": 0.6479511857032776,
784
+ "learning_rate": 2.613491715395861e-05,
785
+ "loss": 0.0328,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 0.6777609682299546,
790
+ "grad_norm": 0.3264722526073456,
791
+ "learning_rate": 2.5282172705535013e-05,
792
+ "loss": 0.0126,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 0.6838124054462935,
797
+ "grad_norm": 0.5275784134864807,
798
+ "learning_rate": 2.4438839446359933e-05,
799
+ "loss": 0.0267,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 0.6898638426626323,
804
+ "grad_norm": 1.2883987426757812,
805
+ "learning_rate": 2.360523847135838e-05,
806
+ "loss": 0.0482,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 0.6959152798789713,
811
+ "grad_norm": 0.6634830832481384,
812
+ "learning_rate": 2.2781687169936795e-05,
813
+ "loss": 0.0287,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 0.7019667170953101,
818
+ "grad_norm": 0.8741732239723206,
819
+ "learning_rate": 2.196849910513858e-05,
820
+ "loss": 0.037,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 0.708018154311649,
825
+ "grad_norm": 0.7417043447494507,
826
+ "learning_rate": 2.1165983894256647e-05,
827
+ "loss": 0.0244,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 0.7140695915279879,
832
+ "grad_norm": 1.1073297262191772,
833
+ "learning_rate": 2.037444709094804e-05,
834
+ "loss": 0.0391,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 0.7201210287443268,
839
+ "grad_norm": 0.9330146908760071,
840
+ "learning_rate": 1.9594190068895968e-05,
841
+ "loss": 0.0312,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 0.7261724659606656,
846
+ "grad_norm": 0.8450430631637573,
847
+ "learning_rate": 1.8825509907063327e-05,
848
+ "loss": 0.0212,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 0.7322239031770046,
853
+ "grad_norm": 0.6666033864021301,
854
+ "learning_rate": 1.8068699276581285e-05,
855
+ "loss": 0.0202,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 0.7382753403933434,
860
+ "grad_norm": 0.8927512168884277,
861
+ "learning_rate": 1.732404632931625e-05,
862
+ "loss": 0.0204,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 0.7443267776096822,
867
+ "grad_norm": 0.919458270072937,
868
+ "learning_rate": 1.6591834588157523e-05,
869
+ "loss": 0.0185,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 0.7503782148260212,
874
+ "grad_norm": 1.465542197227478,
875
+ "learning_rate": 1.5872342839067306e-05,
876
+ "loss": 0.5584,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 0.75642965204236,
881
+ "grad_norm": 1.944527506828308,
882
+ "learning_rate": 1.5165845024934366e-05,
883
+ "loss": 0.7492,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 0.762481089258699,
888
+ "grad_norm": 2.006526470184326,
889
+ "learning_rate": 1.447261014127167e-05,
890
+ "loss": 0.725,
891
+ "step": 126
892
  }
893
  ],
894
  "logging_steps": 1,
 
908
  "attributes": {}
909
  }
910
  },
911
+ "total_flos": 8.835164753677517e+16,
912
  "train_batch_size": 4,
913
  "trial_name": null,
914
  "trial_params": null