bluryar commited on 15 days ago

Commit

426aeec

verified ·

1 Parent(s): 904d8d9

Upload folder using huggingface_hub

Browse files

Files changed (19) hide show

.gitattributes +1 -0
README.md +83 -0
all_results.json +12 -0
config.json +30 -0
eval_results.json +7 -0
generation_config.json +9 -0
llamaboard_config.yaml +77 -0
model.safetensors +3 -0
running_log.txt +0 -0
special_tokens_map.json +32 -0
tokenizer.json +3 -0
tokenizer_config.json +199 -0
train_results.json +8 -0
trainer_log.jsonl +0 -0
trainer_state.json +0 -0
training_args.bin +3 -0
training_args.yaml +40 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,83 @@

+---
+library_name: transformers
+license: other
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+tags:
+- llama-factory
+- full
+- generated_from_trainer
+model-index:
+- name: train_2025-01-23-00-42-56
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# train_2025-01-23-00-42-56
+This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on the smoltalk_chinese dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.9459
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0003
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 64
+- total_eval_batch_size: 16
+- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 100
+- num_epochs: 2.0
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.8719        | 1.7649 | 5000 | 1.9466          |
+### Framework versions
+- Transformers 4.46.1
+- Pytorch 2.5.1+cu124
+- Datasets 3.1.0
+- Tokenizers 0.20.3
+### MISC
+```python
+_register_template(
+    name="deepseekr1",
+    default_system="You are a helpful and harmless assistant. You should think step-by-step. Output your thoughts in <think></think> tags.",
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    format_system=StringFormatter(slots=["{{content}}"]),
+    format_user=StringFormatter(
+        slots=[
+            "<｜User｜>{{content}}<｜Assistant｜>"
+        ]
+    ),
+    stop_words=["<｜end▁of▁sentence｜>"],
+)
+```

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.0,
+    "eval_loss": 1.9459081888198853,
+    "eval_runtime": 5.5654,
+    "eval_samples_per_second": 32.702,
+    "eval_steps_per_second": 2.156,
+    "total_flos": 1.317227096577147e+18,
+    "train_loss": 2.02712393619128,
+    "train_runtime": 22983.8323,
+    "train_samples_per_second": 15.776,
+    "train_steps_per_second": 0.247
+}

config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.1",
+  "use_cache": false,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.0,
+    "eval_loss": 1.9459081888198853,
+    "eval_runtime": 5.5654,
+    "eval_samples_per_second": 32.702,
+    "eval_steps_per_second": 2.156
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151646,
+  "do_sample": true,
+  "eos_token_id": 151643,
+  "temperature": 0.6,
+  "top_p": 0.95,
+  "transformers_version": "4.46.1"
+}

llamaboard_config.yaml ADDED Viewed

	@@ -0,0 +1,77 @@

+top.booster: flashattn2
+top.checkpoint_path: null
+top.finetuning_type: full
+top.model_name: DeepSeek-R1-Distill-Qwen-1.5B
+top.quantization_bit: none
+top.quantization_method: bitsandbytes
+top.rope_scaling: none
+top.template: deepseekr1
+train.additional_target: ''
+train.apollo_rank: 16
+train.apollo_scale: 32
+train.apollo_target: all
+train.apollo_update_interval: 200
+train.badam_mode: layer
+train.badam_switch_interval: 100
+train.badam_switch_mode: ascending
+train.badam_update_ratio: 0.1
+train.batch_size: 4
+train.compute_type: bf16
+train.create_new_adapter: false
+train.cutoff_len: 4096
+train.dataset:
+- smoltalk_chinese
+train.dataset_dir: data
+train.ds_offload: false
+train.ds_stage: '3'
+train.extra_args: '{"optim": "adamw_torch"}'
+train.freeze_extra_modules: ''
+train.freeze_trainable_layers: 2
+train.freeze_trainable_modules: all
+train.galore_rank: 16
+train.galore_scale: 2
+train.galore_target: all
+train.galore_update_interval: 200
+train.gradient_accumulation_steps: 4
+train.learning_rate: 3e-4
+train.logging_steps: 5
+train.lora_alpha: 16
+train.lora_dropout: 0
+train.lora_rank: 8
+train.lora_target: ''
+train.loraplus_lr_ratio: 0
+train.lr_scheduler_type: cosine
+train.mask_history: false
+train.max_grad_norm: '2.0'
+train.max_samples: '10000000'
+train.neat_packing: true
+train.neftune_alpha: 5
+train.num_train_epochs: '2.0'
+train.packing: true
+train.ppo_score_norm: false
+train.ppo_whiten_rewards: false
+train.pref_beta: 0.1
+train.pref_ftx: 0
+train.pref_loss: sigmoid
+train.report_to:
+- none
+train.resize_vocab: false
+train.reward_model: []
+train.save_steps: 5000
+train.swanlab_api_key: ''
+train.swanlab_mode: cloud
+train.swanlab_project: llamafactory
+train.swanlab_run_name: ''
+train.swanlab_workspace: ''
+train.train_on_prompt: false
+train.training_stage: Supervised Fine-Tuning
+train.use_apollo: false
+train.use_badam: true
+train.use_dora: false
+train.use_galore: false
+train.use_llama_pro: false
+train.use_pissa: false
+train.use_rslora: false
+train.use_swanlab: false
+train.val_size: 0.001
+train.warmup_steps: 100

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d86865ddfe03040124d675286fa50334dcec10da725c8f0006e86acb2efaf5b5
+size 3554214752

running_log.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
+size 11422778

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,199 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<｜end▁of▁sentence｜>"
+  ],
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜>'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "legacy": true,
+  "model_max_length": 4096,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "split_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.0,
+    "total_flos": 1.317227096577147e+18,
+    "train_loss": 2.02712393619128,
+    "train_runtime": 22983.8323,
+    "train_samples_per_second": 15.776,
+    "train_steps_per_second": 0.247
+}

trainer_log.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4bdbcc63bb1e183bf81e76ee8bede2261cebb6ebefc9ee52cec53820c60e9af1
+size 7416

training_args.yaml ADDED Viewed

	@@ -0,0 +1,40 @@

+badam_mode: layer
+badam_switch_interval: 100
+badam_switch_mode: ascending
+badam_update_ratio: 0.1
+bf16: true
+cutoff_len: 4096
+dataset: smoltalk_chinese
+dataset_dir: data
+ddp_timeout: 180000000
+deepspeed: cache/ds_z3_config.json
+do_train: true
+eval_steps: 5000
+eval_strategy: steps
+finetuning_type: full
+flash_attn: fa2
+gradient_accumulation_steps: 4
+learning_rate: 0.0003
+logging_steps: 5
+lr_scheduler_type: cosine
+max_grad_norm: 2.0
+max_samples: 10000000
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+neat_packing: true
+neftune_noise_alpha: 5
+num_train_epochs: 2.0
+optim: adamw_torch
+output_dir: saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56
+packing: true
+per_device_eval_batch_size: 4
+per_device_train_batch_size: 4
+plot_loss: true
+preprocessing_num_workers: 16
+report_to: none
+save_steps: 5000
+stage: sft
+template: deepseekr1
+trust_remote_code: true
+use_badam: true
+val_size: 0.001
+warmup_steps: 100

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed