File size: 2,636 Bytes
5a8b19c
 
 
 
 
 
 
c6c38be
5a8b19c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6c38be
5a8b19c
 
 
 
 
c6c38be
5a8b19c
 
 
 
 
c6c38be
 
5a8b19c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
{
    "model_name": "1l-gelu",
    "model_class_name": "HookedTransformer",
    "hook_name": "blocks.0.hook_resid_post",
    "hook_eval": "NOT_IN_USE",
    "hook_layer": 0,
    "hook_head_index": null,
    "dataset_path": "ghidav/arithmetics_reversed",
    "dataset_trust_remote_code": true,
    "streaming": true,
    "is_dataset_tokenized": true,
    "context_size": 18,
    "use_cached_activations": false,
    "cached_activations_path": null,
    "architecture": "jumprelu",
    "d_in": 512,
    "d_sae": 8192,
    "b_dec_init_method": "zeros",
    "expansion_factor": 16,
    "activation_fn": "relu",
    "activation_fn_kwargs": {},
    "normalize_sae_decoder": true,
    "noise_scale": 0.0,
    "from_pretrained_path": null,
    "apply_b_dec_to_input": false,
    "decoder_orthogonal_init": false,
    "decoder_heuristic_init": false,
    "init_encoder_as_decoder_transpose": false,
    "n_batches_in_buffer": 128,
    "training_tokens": 50000000,
    "finetuning_tokens": 0,
    "store_batch_size_prompts": 8,
    "train_batch_size_tokens": 1024,
    "normalize_activations": "none",
    "device": "cuda",
    "act_store_device": "cuda",
    "seed": 42,
    "dtype": "float32",
    "prepend_bos": false,
    "autocast": false,
    "autocast_lm": false,
    "compile_llm": false,
    "llm_compilation_mode": null,
    "compile_sae": false,
    "sae_compilation_mode": null,
    "adam_beta1": 0,
    "adam_beta2": 0.999,
    "mse_loss_normalization": null,
    "l1_coefficient": 2.0,
    "lp_norm": 1,
    "scale_sparsity_penalty_by_decoder_norm": false,
    "l1_warm_up_steps": 2441,
    "lr": 0.0005,
    "lr_scheduler_name": "constant",
    "lr_warm_up_steps": 0,
    "lr_end": 5e-05,
    "lr_decay_steps": 9765,
    "n_restart_cycles": 1,
    "finetuning_method": null,
    "use_ghost_grads": false,
    "feature_sampling_window": 2000,
    "dead_feature_window": 1000,
    "dead_feature_threshold": 1e-06,
    "n_eval_batches": 10,
    "eval_batch_size_prompts": null,
    "log_to_wandb": true,
    "log_activations_store_to_wandb": false,
    "log_optimizer_state_to_wandb": false,
    "wandb_project": "sae-feature-circuits",
    "wandb_id": null,
    "run_name": "L0_hook_resid_post_L1_2_0_rev",
    "wandb_entity": null,
    "wandb_log_frequency": 30,
    "eval_every_n_wandb_logs": 100,
    "resume": false,
    "n_checkpoints": 0,
    "checkpoint_path": "checkpoints/220duyfn",
    "verbose": false,
    "model_kwargs": {},
    "model_from_pretrained_kwargs": {
        "center_writing_weights": false
    },
    "sae_lens_version": "3.20.5",
    "sae_lens_training_version": "3.20.5",
    "tokens_per_buffer": 2359296
}