ttj commited on
Commit
c027895
·
verified ·
1 Parent(s): c479f7c

Add files using upload-large-folder tool

Browse files
Files changed (42) hide show
  1. .gitattributes +12 -0
  2. checkpoints/0000010000/.metadata +0 -0
  3. checkpoints/0000010000/__0_0.distcp +3 -0
  4. checkpoints/0000010000/__1_0.distcp +3 -0
  5. checkpoints/0000010000/__2_0.distcp +3 -0
  6. checkpoints/0000010000/__3_0.distcp +3 -0
  7. checkpoints/0000010000/params.json +1 -0
  8. checkpoints/0000010000/train_state_00000.json +1 -0
  9. checkpoints/0000010000/train_state_00001.json +1 -0
  10. checkpoints/0000010000/train_state_00002.json +1 -0
  11. checkpoints/0000010000/train_state_00003.json +1 -0
  12. checkpoints/0000012500/.metadata +0 -0
  13. checkpoints/0000012500/__0_0.distcp +3 -0
  14. checkpoints/0000012500/__1_0.distcp +3 -0
  15. checkpoints/0000012500/__2_0.distcp +3 -0
  16. checkpoints/0000012500/__3_0.distcp +3 -0
  17. checkpoints/0000012500/params.json +1 -0
  18. checkpoints/0000012500/train_state_00000.json +1 -0
  19. checkpoints/0000012500/train_state_00001.json +1 -0
  20. checkpoints/0000012500/train_state_00002.json +1 -0
  21. checkpoints/0000012500/train_state_00003.json +1 -0
  22. checkpoints/0000015000/.metadata +0 -0
  23. checkpoints/0000015000/__0_0.distcp +3 -0
  24. checkpoints/0000015000/__1_0.distcp +3 -0
  25. checkpoints/0000015000/__2_0.distcp +3 -0
  26. checkpoints/0000015000/__3_0.distcp +3 -0
  27. checkpoints/0000015000/params.json +1 -0
  28. checkpoints/0000015000/train_state_00000.json +1 -0
  29. checkpoints/0000015000/train_state_00001.json +1 -0
  30. checkpoints/0000015000/train_state_00002.json +1 -0
  31. checkpoints/0000015000/train_state_00003.json +1 -0
  32. config.yaml +127 -0
  33. metrics.jsonl +0 -0
  34. profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35064.html +0 -0
  35. profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35065.html +0 -0
  36. profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35066.html +0 -0
  37. profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35067.html +0 -0
  38. profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35064.1736919254163264561.pt.trace.json.gz +3 -0
  39. profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35065.1736919254158506263.pt.trace.json.gz +3 -0
  40. profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35066.1736919254163306670.pt.trace.json.gz +3 -0
  41. profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35067.1736919254164694530.pt.trace.json.gz +3 -0
  42. train.log +0 -0
.gitattributes CHANGED
@@ -33,3 +33,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoints/0000015000/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
37
+ checkpoints/0000012500/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
38
+ checkpoints/0000015000/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
39
+ checkpoints/0000012500/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
40
+ checkpoints/0000010000/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
41
+ checkpoints/0000015000/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
42
+ checkpoints/0000010000/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
43
+ checkpoints/0000015000/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
44
+ checkpoints/0000012500/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
45
+ checkpoints/0000010000/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
46
+ checkpoints/0000012500/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
47
+ checkpoints/0000010000/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
checkpoints/0000010000/.metadata ADDED
Binary file (891 kB). View file
 
checkpoints/0000010000/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:157cef7da550e7e8bdce996cf92ae34dcae2fc8cb48eabdd884d1434c8c3f8bb
3
+ size 6269795820
checkpoints/0000010000/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccf38ad82972b1033f359aefe7171eb4f7c1caf0eb16ef25d0710252a632685e
3
+ size 6269918456
checkpoints/0000010000/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b86a77cc843db329714c7e495cfcff9cafaf2f1071256807f40cc152f48f126a
3
+ size 6269918456
checkpoints/0000010000/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d5733f61727f4cef3082d76e972aeaab3f4d1d90417bfbec1f253f4c1de10dc
3
+ size 6269929976
checkpoints/0000010000/params.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"name": "large_lm", "dump_dir": "./dump_dir_llama1b2-mla-nope-2", "seed": 777, "grad_acc_steps": 8, "gc_collect_freq": 1000, "probe_freq": null, "steps": 60000, "data": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "batch_size": 4, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}}, "optim": {"lr": 0.003, "weight_decay": 0.033, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 5000, "lr_min_ratio": 1e-06, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 2048, "n_layers": 25, "head_dim": 128, "n_heads": 48, "n_kv_heads": 48, "ffn_dim_multiplier": null, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "init_base_std": null, "init_std_factor": "disabled", "rope_type": "none", "rope_inv_freq_learnable": false, "max_seqlen": 4096, "use_mla": "simple", "q_lora_rank": 1536, "kv_lora_rank": 512, "seed": 42, "vocab_size": 100512, "weight_tying": false, "sliding_window": null}, "distributed": {"dp_shard": 1, "dp_replicate": 4, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": true, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 2500, "keep": 3}, "eval": {"every": 5000000000, "keep": -1}, "path": "dump_dir_llama1b2-mla-nope-2/checkpoints", "init_ckpt_path": null, "continue_training_from_init": false}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 0, "mem_steps": 4, "profile_warmup": 100, "profile_steps": 4}, "logging": {"freq": 1, "acc_freq": null, "wandb": null}, "async_eval_gpus": 1, "eval": {"harness": {"tasks": ["hellaswag", {"task": "boolq", "dataset_kwargs": {"trust_remote_code": true}}, "piqa", {"task": "social_iqa", "dataset_kwargs": {"trust_remote_code": true}}, "winogrande", "openbookqa", "arc_easy", "arc_challenge", "race", "commonsense_qa", "copa"]}, "validation": {"max_steps": 1000}, "generator": {"max_tokens": 16384, "dtype": "bf16"}}}
checkpoints/0000010000/train_state_00000.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 10000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 5854, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 27139949444, "block_size": 4, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 150743230930327169744799492781598786875, "inc": 11676600559890430755450356507027720041}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 128, "rng_state": {"bit_generator": "PCG64", "state": {"state": 6884758698580062763871696989832130994, "inc": 77357518920597472829800677777012462921}, "has_uint32": 1, "uinteger": 4238670441}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 10000, "verbose": false, "_step_count": 10001, "_get_lr_called_within_step": false, "_last_lr": [0.002939239521182286], "lr_lambdas": [{}]}}
checkpoints/0000010000/train_state_00001.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 10000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 910, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 27234304200, "block_size": 4, "offset": 1, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 167417803618091562229677569034779292078, "inc": 239634081480473411747239400828488620799}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 128, "rng_state": {"bit_generator": "PCG64", "state": {"state": 142840750648924094080300450844583521998, "inc": 270234035871729269002159329014059236425}, "has_uint32": 1, "uinteger": 2360336988}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 10000, "verbose": false, "_step_count": 10001, "_get_lr_called_within_step": false, "_last_lr": [0.002939239521182286], "lr_lambdas": [{}]}}
checkpoints/0000010000/train_state_00002.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 10000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 468, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 27150856174, "block_size": 4, "offset": 2, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 71908710999316559150302827820763076717, "inc": 6027823433652931085739778990793808165}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 128, "rng_state": {"bit_generator": "PCG64", "state": {"state": 290492482154398532809935945696334257064, "inc": 188564971970541749319992297790591572713}, "has_uint32": 0, "uinteger": 1252568123}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 10000, "verbose": false, "_step_count": 10001, "_get_lr_called_within_step": false, "_last_lr": [0.002939239521182286], "lr_lambdas": [{}]}}
checkpoints/0000010000/train_state_00003.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 10000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 2380, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 27184167541, "block_size": 4, "offset": 3, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 327681619799909732134614670727857379100, "inc": 92941856108932518968286621281627530405}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 128, "rng_state": {"bit_generator": "PCG64", "state": {"state": 325056812806434293018455649520496755860, "inc": 66050176413739185524746886687120723265}, "has_uint32": 0, "uinteger": 3173786020}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 10000, "verbose": false, "_step_count": 10001, "_get_lr_called_within_step": false, "_last_lr": [0.002939239521182286], "lr_lambdas": [{}]}}
checkpoints/0000012500/.metadata ADDED
Binary file (891 kB). View file
 
checkpoints/0000012500/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a08f591662b8e1f1994dcde896c9c9554f77c200e7353d88dda2b244788f662
3
+ size 6269795820
checkpoints/0000012500/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a99337a63d3c5ba6a30cc8adcf9fea6457aa8bd59c6c5f20f9a177356c731035
3
+ size 6269918456
checkpoints/0000012500/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:682f7d63506dda5ebbfc1b5744c63e1e4d5ec4510993174e97183dac8984df9d
3
+ size 6269918456
checkpoints/0000012500/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:153474b2d8233d70f426d7c06397ea8996f3c53324babb0c5d51c82255f2bd8d
3
+ size 6269929976
checkpoints/0000012500/params.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"name": "large_lm", "dump_dir": "./dump_dir_llama1b2-mla-nope-2", "seed": 777, "grad_acc_steps": 8, "gc_collect_freq": 1000, "probe_freq": null, "steps": 60000, "data": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "batch_size": 4, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}}, "optim": {"lr": 0.003, "weight_decay": 0.033, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 5000, "lr_min_ratio": 1e-06, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 2048, "n_layers": 25, "head_dim": 128, "n_heads": 48, "n_kv_heads": 48, "ffn_dim_multiplier": null, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "init_base_std": null, "init_std_factor": "disabled", "rope_type": "none", "rope_inv_freq_learnable": false, "max_seqlen": 4096, "use_mla": "simple", "q_lora_rank": 1536, "kv_lora_rank": 512, "seed": 42, "vocab_size": 100512, "weight_tying": false, "sliding_window": null}, "distributed": {"dp_shard": 1, "dp_replicate": 4, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": true, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 2500, "keep": 3}, "eval": {"every": 5000000000, "keep": -1}, "path": "dump_dir_llama1b2-mla-nope-2/checkpoints", "init_ckpt_path": null, "continue_training_from_init": false}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 0, "mem_steps": 4, "profile_warmup": 100, "profile_steps": 4}, "logging": {"freq": 1, "acc_freq": null, "wandb": null}, "async_eval_gpus": 1, "eval": {"harness": {"tasks": ["hellaswag", {"task": "boolq", "dataset_kwargs": {"trust_remote_code": true}}, "piqa", {"task": "social_iqa", "dataset_kwargs": {"trust_remote_code": true}}, "winogrande", "openbookqa", "arc_easy", "arc_challenge", "race", "commonsense_qa", "copa"]}, "validation": {"max_steps": 1000}, "generator": {"max_tokens": 16384, "dtype": "bf16"}}}
checkpoints/0000012500/train_state_00000.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 12500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 2508, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 33779328140, "block_size": 4, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 66845093718062145952885693940422546698, "inc": 11676600559890430755450356507027720041}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 132651110094990385089663171643329315190, "inc": 77357518920597472829800677777012462921}, "has_uint32": 1, "uinteger": 818147430}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 12500, "verbose": false, "_step_count": 12501, "_get_lr_called_within_step": false, "_last_lr": [0.0028644481285837846], "lr_lambdas": [{}]}}
checkpoints/0000012500/train_state_00001.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 12500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 619, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 33856284353, "block_size": 4, "offset": 1, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 36077557639709628631524445109225845135, "inc": 239634081480473411747239400828488620799}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 254953852730806962758153674603584072950, "inc": 270234035871729269002159329014059236425}, "has_uint32": 0, "uinteger": 276092175}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 12500, "verbose": false, "_step_count": 12501, "_get_lr_called_within_step": false, "_last_lr": [0.0028644481285837846], "lr_lambdas": [{}]}}
checkpoints/0000012500/train_state_00002.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 12500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 365, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 33773748021, "block_size": 4, "offset": 2, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 335229104681307419044321615228439932641, "inc": 6027823433652931085739778990793808165}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 173198904928546993181710233269373117070, "inc": 188564971970541749319992297790591572713}, "has_uint32": 1, "uinteger": 1263483293}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 12500, "verbose": false, "_step_count": 12501, "_get_lr_called_within_step": false, "_last_lr": [0.0028644481285837846], "lr_lambdas": [{}]}}
checkpoints/0000012500/train_state_00003.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 12500, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 2819, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 33776023826, "block_size": 4, "offset": 3, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 39578767758864957214411806561586121746, "inc": 92941856108932518968286621281627530405}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 672, "rng_state": {"bit_generator": "PCG64", "state": {"state": 260255623020490950593466523618230265090, "inc": 66050176413739185524746886687120723265}, "has_uint32": 1, "uinteger": 2482301038}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 12500, "verbose": false, "_step_count": 12501, "_get_lr_called_within_step": false, "_last_lr": [0.0028644481285837846], "lr_lambdas": [{}]}}
checkpoints/0000015000/.metadata ADDED
Binary file (891 kB). View file
 
checkpoints/0000015000/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f051c6153a26f7629602e4ac14cb632a74fb4f4a7b88cd98b64eeb129bcc3947
3
+ size 6269795820
checkpoints/0000015000/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34fa8d2a856e15f6f0ddba3810d377b9ab15ee244b109bc38a6dba9d2b715ce9
3
+ size 6269918456
checkpoints/0000015000/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b21a14230dc75a3d25ae3d1f0a6396b12b0a1ab5416b382ca698e3e0dec0c976
3
+ size 6269918456
checkpoints/0000015000/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:609b2eeca07825dd3f7ea0b5a2850c512a0b68553b8390583404e8048e30cb54
3
+ size 6269929976
checkpoints/0000015000/params.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"name": "large_lm", "dump_dir": "./dump_dir_llama1b2-mla-nope-2", "seed": 777, "grad_acc_steps": 8, "gc_collect_freq": 1000, "probe_freq": null, "steps": 60000, "data": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "batch_size": 4, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}}, "optim": {"lr": 0.003, "weight_decay": 0.033, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 5000, "lr_min_ratio": 1e-06, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 2048, "n_layers": 25, "head_dim": 128, "n_heads": 48, "n_kv_heads": 48, "ffn_dim_multiplier": null, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "init_base_std": null, "init_std_factor": "disabled", "rope_type": "none", "rope_inv_freq_learnable": false, "max_seqlen": 4096, "use_mla": "simple", "q_lora_rank": 1536, "kv_lora_rank": 512, "seed": 42, "vocab_size": 100512, "weight_tying": false, "sliding_window": null}, "distributed": {"dp_shard": 1, "dp_replicate": 4, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": true, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 2500, "keep": 3}, "eval": {"every": 5000000000, "keep": -1}, "path": "dump_dir_llama1b2-mla-nope-2/checkpoints", "init_ckpt_path": null, "continue_training_from_init": false}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 0, "mem_steps": 4, "profile_warmup": 100, "profile_steps": 4}, "logging": {"freq": 1, "acc_freq": null, "wandb": null}, "async_eval_gpus": 1, "eval": {"harness": {"tasks": ["hellaswag", {"task": "boolq", "dataset_kwargs": {"trust_remote_code": true}}, "piqa", {"task": "social_iqa", "dataset_kwargs": {"trust_remote_code": true}}, "winogrande", "openbookqa", "arc_easy", "arc_challenge", "race", "commonsense_qa", "copa"]}, "validation": {"max_steps": 1000}, "generator": {"max_tokens": 16384, "dtype": "bf16"}}}
checkpoints/0000015000/train_state_00000.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 15000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1805, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 40715081775, "block_size": 4, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 251662406972816758099174584040120893882, "inc": 11676600559890430755450356507027720041}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 192, "rng_state": {"bit_generator": "PCG64", "state": {"state": 104024892605924312590652555861151974329, "inc": 77357518920597472829800677777012462921}, "has_uint32": 0, "uinteger": 1828438254}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 15000, "verbose": false, "_step_count": 15001, "_get_lr_called_within_step": false, "_last_lr": [0.0027618805373664725], "lr_lambdas": [{}]}}
checkpoints/0000015000/train_state_00001.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 15000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 52, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 40815317393, "block_size": 4, "offset": 1, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 235612834052381029402277114709496902488, "inc": 239634081480473411747239400828488620799}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 192, "rng_state": {"bit_generator": "PCG64", "state": {"state": 251010249832081823055400505834513794530, "inc": 270234035871729269002159329014059236425}, "has_uint32": 1, "uinteger": 83291782}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 15000, "verbose": false, "_step_count": 15001, "_get_lr_called_within_step": false, "_last_lr": [0.0027618805373664725], "lr_lambdas": [{}]}}
checkpoints/0000015000/train_state_00002.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 15000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 3445, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 40774849393, "block_size": 4, "offset": 2, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 58593197680791640417035883735834969432, "inc": 6027823433652931085739778990793808165}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 192, "rng_state": {"bit_generator": "PCG64", "state": {"state": 309383463004337416263254616398486634642, "inc": 188564971970541749319992297790591572713}, "has_uint32": 1, "uinteger": 427777537}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 15000, "verbose": false, "_step_count": 15001, "_get_lr_called_within_step": false, "_last_lr": [0.0027618805373664725], "lr_lambdas": [{}]}}
checkpoints/0000015000/train_state_00003.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 15000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 678, "it_state": {"it_state": {"root_dir": "./data", "sources": {"fineweb_edu_10bt_shuffled": 100.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 40762562516, "block_size": 4, "offset": 3, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 180490400181664997602061835664789889891, "inc": 92941856108932518968286621281627530405}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "tiktoken", "path": "tokenizers/cl100k_base.tiktoken"}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 192, "rng_state": {"bit_generator": "PCG64", "state": {"state": 2077334738881764798330840083423581321, "inc": 66050176413739185524746886687120723265}, "has_uint32": 1, "uinteger": 2643615857}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 15000, "verbose": false, "_step_count": 15001, "_get_lr_called_within_step": false, "_last_lr": [0.0027618805373664725], "lr_lambdas": [{}]}}
config.yaml ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: large_lm
2
+ dump_dir: ./dump_dir_llama1b2-mla-nope-2
3
+ seed: 777
4
+ grad_acc_steps: 8
5
+ gc_collect_freq: 1000
6
+ probe_freq: null
7
+ steps: 60000
8
+ data:
9
+ root_dir: ./data
10
+ sources:
11
+ fineweb_edu_10bt_shuffled: 100.0
12
+ batch_size: 4
13
+ seq_len: 4096
14
+ n_views: 2
15
+ seed: 42
16
+ add_bos: true
17
+ add_eos: true
18
+ load_async: true
19
+ prefetch_size: 1024
20
+ tokenizer:
21
+ name: tiktoken
22
+ path: tokenizers/cl100k_base.tiktoken
23
+ optim:
24
+ lr: 0.003
25
+ weight_decay: 0.033
26
+ epsilon: 1.0e-08
27
+ beta1: 0.9
28
+ beta2: 0.95
29
+ clip: 1.0
30
+ scheduler: cosine
31
+ warmup: 5000
32
+ lr_min_ratio: 1.0e-06
33
+ cycle_length: 1.0
34
+ cosine_theta: 1.0
35
+ annealing_step: 1000
36
+ decay_fraction: 0.1
37
+ exp_factor: 0.5
38
+ model:
39
+ dim: 2048
40
+ n_layers: 25
41
+ head_dim: 128
42
+ n_heads: 48
43
+ n_kv_heads: 48
44
+ ffn_dim_multiplier: null
45
+ multiple_of: 256
46
+ norm_eps: 1.0e-05
47
+ rope_theta: 10000.0
48
+ init_base_std: null
49
+ init_std_factor: disabled
50
+ rope_type: none
51
+ rope_inv_freq_learnable: false
52
+ max_seqlen: 4096
53
+ use_mla: simple
54
+ q_lora_rank: 1536
55
+ kv_lora_rank: 512
56
+ seed: 42
57
+ vocab_size: 100512
58
+ weight_tying: false
59
+ sliding_window: null
60
+ distributed:
61
+ dp_shard: 1
62
+ dp_replicate: 4
63
+ tp_size: 1
64
+ selective_activation_checkpointing: false
65
+ compile: true
66
+ fsdp_type: full_shard
67
+ model_dtype: bf16
68
+ float8_recipe: null
69
+ float8_filter: layers\.[0-9]+\.
70
+ matmul_allow_tf32: true
71
+ detect_anomaly: false
72
+ compile_cache_size_limit: 8
73
+ spawn_method: forkserver
74
+ env:
75
+ MKL_SERVICE_FORCE_INTEL: GNU
76
+ OMP_NUM_THREADS: '1'
77
+ MKL_NUM_THREADS: '1'
78
+ ENABLE_INTRA_NODE_COMM: '1'
79
+ TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
80
+ NCCL_IB_TIMEOUT: '22'
81
+ NCCL_DEBUG: INFO
82
+ TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
83
+ checkpoint:
84
+ dump:
85
+ every: 2500
86
+ keep: 3
87
+ eval:
88
+ every: 5000000000
89
+ keep: -1
90
+ path: dump_dir_llama1b2-mla-nope-2/checkpoints
91
+ init_ckpt_path: null
92
+ continue_training_from_init: false
93
+ profiling:
94
+ run: true
95
+ trace_folder: profiling
96
+ mem_warmup: 0
97
+ mem_steps: 4
98
+ profile_warmup: 100
99
+ profile_steps: 4
100
+ logging:
101
+ freq: 1
102
+ acc_freq: null
103
+ wandb: null
104
+ async_eval_gpus: 1
105
+ eval:
106
+ harness:
107
+ tasks:
108
+ - hellaswag
109
+ - task: boolq
110
+ dataset_kwargs:
111
+ trust_remote_code: true
112
+ - piqa
113
+ - task: social_iqa
114
+ dataset_kwargs:
115
+ trust_remote_code: true
116
+ - winogrande
117
+ - openbookqa
118
+ - arc_easy
119
+ - arc_challenge
120
+ - race
121
+ - commonsense_qa
122
+ - copa
123
+ validation:
124
+ max_steps: 1000
125
+ generator:
126
+ max_tokens: 16384
127
+ dtype: bf16
metrics.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35064.html ADDED
The diff for this file is too large to render. See raw diff
 
profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35065.html ADDED
The diff for this file is too large to render. See raw diff
 
profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35066.html ADDED
The diff for this file is too large to render. See raw diff
 
profiling/memory_trace_plot/000004_stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35067.html ADDED
The diff for this file is too large to render. See raw diff
 
profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35064.1736919254163264561.pt.trace.json.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50df3653ac709c4463b85b1aeaf5bb85e1f39d0ed162b6f8b92877336f9dc85b
3
+ size 2433872
profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35065.1736919254158506263.pt.trace.json.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adf0e3323b801c8d82d38e5746ab38e8080a563a65b3171278ca71fb5548d7af
3
+ size 2440005
profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35066.1736919254163306670.pt.trace.json.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:305584f1710c8c7e50401aa2de28906da9972e884c063db345b4d43c647126c7
3
+ size 2439209
profiling/profile_CPU_CUDA_000104/stable-diffusion-xl-dev-2-retina-newsroom-gpu-v3-85f5d97fdc6s78_35067.1736919254164694530.pt.trace.json.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82cfae9143ff4446a517f24b4598c5a68858f090e5eb159ec4c331c9e59adff8
3
+ size 2440322
train.log ADDED
The diff for this file is too large to render. See raw diff