---
# Training configuration "large_lm": LLaMA-style decoder-only LM,
# dim 2048 / 25 layers / 16 heads, 4096-token context, tiktoken cl100k vocab.
# NOTE(review): reconstructed into block YAML from a whitespace-mangled
# single-line dump; every value is preserved as-is.

name: large_lm
dump_dir: ./dump_dir_llama1b2
seed: 777
grad_acc_steps: 2        # gradient accumulation steps per optimizer step
gc_collect_freq: 1000    # presumably triggers gc.collect() every N steps — confirm
probe_freq: null
steps: 60000             # total training steps

data:
  root_dir: ./data
  # mapping of dataset name -> sampling weight
  sources:
    fineweb_edu_10bt_shuffled: 100.0
  batch_size: 8
  seq_len: 4096
  n_views: 2
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 1024
  tokenizer:
    name: tiktoken
    path: tokenizers/cl100k_base.tiktoken

optim:
  lr: 0.003
  weight_decay: 0.033
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  clip: 1.0              # gradient-norm clipping threshold
  scheduler: cosine
  warmup: 5000           # warmup steps before cosine decay
  lr_min_ratio: 1.0e-06
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5

model:
  dim: 2048
  n_layers: 25
  head_dim: null         # null -> presumably derived as dim / n_heads; confirm
  n_heads: 16
  n_kv_heads: null       # null -> no grouped-query attention override
  ffn_dim_multiplier: null
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 10000.0
  init_base_std: null
  init_std_factor: disabled
  rope_type: original
  rope_inv_freq_learnable: false
  max_seqlen: 4096
  # NOTE(review): empty string is falsy — looks like MLA is disabled, but
  # confirm the consumer's expected type (bool vs str) for this field.
  use_mla: ''
  q_lora_rank: 1536      # only relevant when MLA is enabled
  kv_lora_rank: 512      # only relevant when MLA is enabled
  seed: 42
  vocab_size: 100512
  weight_tying: false
  sliding_window: null

distributed:
  dp_shard: 1
  dp_replicate: 4
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  float8_recipe: null
  # Regex selecting modules for float8; single-quoted so the backslashes
  # and brackets can never be re-typed or mangled by tooling.
  float8_filter: 'layers\.[0-9]+\.'
  matmul_allow_tf32: true
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver

# Environment variables for worker processes; values deliberately quoted
# so they stay strings when exported.
env:
  MKL_SERVICE_FORCE_INTEL: GNU
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'

checkpoint:
  dump:
    every: 2500          # save a checkpoint every 2500 steps
    keep: 3              # retain only the 3 most recent dumps
  eval:
    every: 5000000000    # far beyond `steps` (60000) — eval checkpoints effectively never fire
    keep: -1             # -1 presumably means keep all; confirm
  path: dump_dir_llama1b2/checkpoints
  init_ckpt_path: null
  continue_training_from_init: false

profiling:
  run: true
  trace_folder: profiling
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4

logging:
  freq: 1                # log every step
  acc_freq: null
  wandb: null            # wandb disabled

async_eval_gpus: 1

eval:
  harness:
    # lm-eval-harness task list; entries are either a bare task name or a
    # mapping with per-task kwargs.
    tasks:
      - hellaswag
      - task: boolq
        dataset_kwargs:
          trust_remote_code: true
      - piqa
      - task: social_iqa
        dataset_kwargs:
          trust_remote_code: true
      - winogrande
      - openbookqa
      - arc_easy
      - arc_challenge
      - race
      - commonsense_qa
      - copa
  validation:
    max_steps: 1000
  generator:
    max_tokens: 16384
    dtype: bf16