ethanhs committed
Commit 0957885 · 1 Parent(s): 0303e65

Update model card info and add axolotl config

Files changed (2):
  1. README.md +4 -0
  2. xgen-7b-8k-qlora.yml +88 -0
README.md CHANGED
@@ -14,6 +14,10 @@ As both the model and dataset are under the Apache-2.0 license, this model is un
 The model was trained for just over 3000 steps.
 
 ---
+language:
+- en
 license: apache-2.0
+datasets:
+- timdettmers/openassistant-guanaco
 ---
 
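The new `datasets:` tag in the model card points at the same corpus used for fine-tuning (see the config below). As a quick orientation, here is a minimal sketch, not part of this commit, of peeking at that data with the 🤗 `datasets` library; the file name is copied from xgen-7b-8k-qlora.yml, and the `text` column is an assumption about how this particular dataset is laid out.

```python
# Illustrative only: inspect the dataset referenced by the new metadata tag
# and by xgen-7b-8k-qlora.yml below.
from datasets import load_dataset

guanaco = load_dataset(
    "timdettmers/openassistant-guanaco",
    data_files="openassistant_best_replies_train.jsonl",  # file name from the config
    split="train",
)
print(len(guanaco))              # number of training examples
print(guanaco[0]["text"][:200])  # "### Human: ... ### Assistant: ..." style transcript
```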
xgen-7b-8k-qlora.yml ADDED
@@ -0,0 +1,88 @@
+base_model: Salesforce/xgen-7b-8k-base
+base_model_config: Salesforce/xgen-7b-8k-base
+trust_remote_code: true
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+load_in_8bit: false
+# enable 4bit for QLoRA
+load_in_4bit: true
+gptq: false
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: timdettmers/openassistant-guanaco
+    data_files:
+      - openassistant_best_replies_train.jsonl
+    type: "completion"
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+# enable QLoRA
+adapter: qlora
+lora_model_dir:
+sequence_len: 8192
+max_packed_sequence_len:
+
+# hyperparameters from QLoRA paper Appendix B.2
+# "We find hyperparameters to be largely robust across datasets"
+lora_r: 64
+lora_alpha: 16
+# 0.1 for models up to 13B
+# 0.05 for 33B and 65B models
+lora_dropout: 0.05
+# add LoRA modules on all linear layers of the base model
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project: xgen-7b-8k-guanaco-qlora
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./xgen-7b-8k-guanaco-qlora
+
+# QLoRA paper Table 9
+# - 16 for 7b & 13b
+# - 32 for 33b, 64 for 64b
+# Max size tested on A6000
+# - 7b: 40
+# - 40b: 4
+# decrease if OOM, increase for max VRAM utilization
+micro_batch_size: 1
+gradient_accumulation_steps: 1
+num_epochs: 3
+# Optimizer for QLoRA
+optimizer: paged_adamw_32bit
+torchdistx_path:
+lr_scheduler: cosine
+# QLoRA paper Table 9
+# - 2e-4 for 7b & 13b
+# - 1e-4 for 33b & 64b
+learning_rate: 0.00002
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+gradient_checkpointing: true
+# stop training after this many evaluation losses have increased in a row
+# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+early_stopping_patience: 3
+resume_from_checkpoint:
+auto_resume_from_checkpoints: true
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 10
+eval_steps: 50
+save_steps: 50
+debug:
+deepspeed:
+weight_decay: 0.0
+special_tokens:
+  eos_token: "<|endoftext|>"
+  bos_token: "<|endoftext|>"
+  unk_token: "<|endoftext|>"
+  pad_token: "<|endoftext|>"
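
For readers unfamiliar with how axolotl interprets these options, the sketch below shows roughly what the QLoRA-related keys (load_in_4bit, adapter: qlora, lora_r/lora_alpha/lora_dropout, bf16, gradient_checkpointing) correspond to when wired up directly with transformers + peft + bitsandbytes. This is not the axolotl implementation; in particular, the explicit target_modules list stands in for `lora_target_linear: true` and is an assumption about the base model's layer names.

```python
# Illustrative only -- not the axolotl training loop. Maps the QLoRA-related
# keys in the config above onto transformers + peft + bitsandbytes.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# load_in_4bit: true -> quantize the frozen base model to 4-bit (NF4 is the
# QLoRA-paper default; the config itself does not pin the quant type).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # matches bf16: true
)

model = AutoModelForCausalLM.from_pretrained(
    "Salesforce/xgen-7b-8k-base",   # base_model / base_model_config
    quantization_config=bnb_config,
    trust_remote_code=True,         # trust_remote_code: true
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
    "Salesforce/xgen-7b-8k-base", trust_remote_code=True
)

# gradient_checkpointing: true, plus the usual k-bit training preparation
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# adapter: qlora with r=64, alpha=16, dropout=0.05 taken from the config.
# lora_target_linear: true means "all linear layers of the base model"; the
# explicit list here is only a placeholder assuming LLaMA-style projection names.
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the LoRA adapter weights are trainable
```

In practice a config like this was consumed by axolotl itself rather than hand-written code; at the time of this commit that typically meant an accelerate launch of axolotl's finetune script with xgen-7b-8k-qlora.yml as the argument, though the exact command depends on the axolotl version installed.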