dimasik87 committed
Commit eb6c3c8 · verified · 1 Parent(s): 6d3edd0

End of training

Files changed (2):
  1. README.md +31 -27
  2. adapter_model.bin +2 -2
README.md CHANGED
@@ -43,12 +43,12 @@ early_stopping_patience: null
  eval_max_new_tokens: 128
  eval_table_size: null
  evals_per_epoch: 4
- flash_attention: true
- fp16: true
+ flash_attention: false
+ fp16: null
  fsdp: null
  fsdp_config: null
  gradient_accumulation_steps: 4
- gradient_checkpointing: true
+ gradient_checkpointing: false
  group_by_length: false
  hub_model_id: dimasik87/06770fdd-0810-4759-908a-5d296333672a
  hub_repo: null
@@ -59,28 +59,28 @@ load_in_4bit: false
  load_in_8bit: false
  local_rank: null
  logging_steps: 1
- lora_alpha: 16
- lora_dropout: 0.1
+ lora_alpha: 32
+ lora_dropout: 0.05
  lora_fan_in_fan_out: null
  lora_model_dir: null
- lora_r: 8
+ lora_r: 16
  lora_target_linear: true
  lr_scheduler: cosine
  max_memory:
    0: 70GiB
- max_steps: 25
- micro_batch_size: 1
+ max_steps: 50
+ micro_batch_size: 2
  mlflow_experiment_name: /tmp/55fc4b709c64c233_train_data.json
  model_type: AutoModelForCausalLM
- num_epochs: 3
- optimizer: adamw_torch
+ num_epochs: 4
+ optimizer: adamw_bnb_8bit
  output_dir: miner_id_24
  pad_to_sequence_len: true
  resume_from_checkpoint: null
  s2_attention: null
  sample_packing: false
- saves_per_epoch: 3
- sequence_len: 2028
+ saves_per_epoch: 4
+ sequence_len: 1024
  special_tokens:
    pad_token: <|eot_id|>
  strict: false
@@ -107,7 +107,7 @@ xformers_attention: null
 
  This model is a fine-tuned version of [elyza/Llama-3-ELYZA-JP-8B](https://huggingface.co/elyza/Llama-3-ELYZA-JP-8B) on the None dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.8398
+ - Loss: 0.5962
 
  ## Model description
 
@@ -127,29 +127,33 @@ More information needed
 
  The following hyperparameters were used during training:
  - learning_rate: 0.0002
- - train_batch_size: 1
- - eval_batch_size: 1
+ - train_batch_size: 2
+ - eval_batch_size: 2
  - seed: 42
  - gradient_accumulation_steps: 4
- - total_train_batch_size: 4
- - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+ - total_train_batch_size: 8
+ - optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
  - lr_scheduler_type: cosine
  - lr_scheduler_warmup_steps: 10
- - training_steps: 25
+ - training_steps: 50
 
  ### Training results
 
  | Training Loss | Epoch  | Step | Validation Loss |
  |:-------------:|:------:|:----:|:---------------:|
- | 11.9265       | 0.0004 | 1    | 12.7843         |
- | 13.0835       | 0.0013 | 3    | 12.6270         |
- | 11.0406       | 0.0026 | 6    | 8.3529          |
- | 6.2842        | 0.0039 | 9    | 4.4139          |
- | 1.8928        | 0.0052 | 12   | 1.3278          |
- | 1.1507        | 0.0065 | 15   | 0.8007          |
- | 0.8566        | 0.0078 | 18   | 0.6212          |
- | 0.4075        | 0.0091 | 21   | 0.8347          |
- | 0.8743        | 0.0103 | 24   | 0.8398          |
+ | 12.2382       | 0.0009 | 1    | 12.7834         |
+ | 12.0138       | 0.0034 | 4    | 10.4872         |
+ | 4.8259        | 0.0069 | 8    | 1.5156          |
+ | 0.5719        | 0.0103 | 12   | 0.8780          |
+ | 0.9423        | 0.0138 | 16   | 0.5733          |
+ | 0.4543        | 0.0172 | 20   | 0.8335          |
+ | 1.3692        | 0.0207 | 24   | 1.0001          |
+ | 0.9116        | 0.0241 | 28   | 0.6467          |
+ | 0.7579        | 0.0276 | 32   | 0.8243          |
+ | 0.4857        | 0.0310 | 36   | 0.6007          |
+ | 0.723         | 0.0345 | 40   | 0.6944          |
+ | 0.3938        | 0.0379 | 44   | 0.6142          |
+ | 0.4404        | 0.0414 | 48   | 0.5962          |
 
 
  ### Framework versions
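
Note that the reported total_train_batch_size of 8 is simply micro_batch_size (2) × gradient_accumulation_steps (4). For reference, a minimal sketch (not part of this commit) of loading the resulting adapter with PEFT; it assumes `transformers` and `peft` are installed, and takes the repo ids from the config above:

```python
# Minimal sketch of applying this LoRA adapter to the base model with PEFT.
# Repo ids come from the config above (base_model and hub_model_id).
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("elyza/Llama-3-ELYZA-JP-8B")
tokenizer = AutoTokenizer.from_pretrained("elyza/Llama-3-ELYZA-JP-8B")
# Load the adapter weights (adapter_model.bin) on top of the frozen base model.
model = PeftModel.from_pretrained(base, "dimasik87/06770fdd-0810-4759-908a-5d296333672a")
model.eval()
```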
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:18ce7fd112b589d7d16d3ef9e16af65cce4135900e8b26936b8a21e07351b8db
- size 84047370
+ oid sha256:969fc9ac044e59ab6546496e2a64a8cdda6558338209e37f22e2dc20b1fe77f0
+ size 167934026
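
The adapter file roughly doubles (84047370 → 167934026 bytes), consistent with lora_r doubling from 8 to 16: LoRA parameter count scales linearly in the rank. A back-of-the-envelope check, assuming standard Llama-3-8B projection shapes and fp32 adapter storage (assumptions on my part; neither is stated in the commit):

```python
# Rough LoRA size check; module shapes are standard Llama-3-8B values,
# not read from this commit. KV_DIM = 8 KV heads * head_dim 128 (GQA).
HIDDEN, INTER, KV_DIM, LAYERS = 4096, 14336, 1024, 32

def lora_params(r: int) -> int:
    """Parameters of LoRA pairs A (d_in x r) and B (r x d_out) across all
    linear projections, per lora_target_linear: true."""
    projections = [
        (HIDDEN, HIDDEN),  # q_proj
        (HIDDEN, KV_DIM),  # k_proj
        (HIDDEN, KV_DIM),  # v_proj
        (HIDDEN, HIDDEN),  # o_proj
        (HIDDEN, INTER),   # gate_proj
        (HIDDEN, INTER),   # up_proj
        (INTER, HIDDEN),   # down_proj
    ]
    return LAYERS * sum(r * (d_in + d_out) for d_in, d_out in projections)

for r in (8, 16):
    n = lora_params(r)
    print(f"r={r:2d}: {n / 1e6:.1f}M params, ~{4 * n / 1e6:.0f} MB in fp32")
# r= 8: 21.0M params, ~84 MB   -> old adapter_model.bin (84047370 bytes)
# r=16: 41.9M params, ~168 MB  -> new adapter_model.bin (167934026 bytes)
```

The ~0.16 MB left over in each file beyond the raw weights is plausibly serialization overhead.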