Add files using upload-large-folder tool
- 20250120235238/rank1.log +395 -0
- 20250120235238/rank11.log +395 -0
- 20250120235238/rank13.log +395 -0
- 20250120235238/rank15.log +395 -0
- 20250120235238/rank17.log +395 -0
- 20250120235238/rank18.log +395 -0
- 20250120235238/rank21.log +395 -0
- 20250120235238/rank23.log +395 -0
- 20250120235238/rank25.log +395 -0
- 20250120235238/rank26.log +395 -0
- 20250120235238/rank3.log +395 -0
- 20250120235238/rank31.log +395 -0
- 20250120235238/rank33.log +395 -0
- 20250120235238/rank34.log +395 -0
- 20250120235238/rank35.log +395 -0
- 20250120235238/rank38.log +395 -0
- 20250120235238/rank4.log +395 -0
- 20250120235238/rank40.log +395 -0
- 20250120235238/rank42.log +395 -0
- 20250120235238/rank43.log +395 -0
- 20250120235238/rank44.log +395 -0
- 20250120235238/rank46.log +395 -0
- 20250120235238/rank49.log +395 -0
- 20250120235238/rank5.log +395 -0
- 20250120235238/rank54.log +395 -0
- 20250120235238/rank56.log +395 -0
- 20250120235238/rank59.log +395 -0
- 20250120235238/rank6.log +395 -0
- 20250120235238/rank60.log +395 -0
- 20250120235238/rank62.log +395 -0
- 20250120235238/rank7.log +395 -0
- 20250120235238/rank8.log +395 -0
- 20250120235238/rank9.log +395 -0
- 20250121104251/rank0.log +0 -0
- 20250121104251/rank16.log +294 -0
- 20250121104251/rank30.log +294 -0
- 20250121104251/rank45.log +294 -0
- 20250121104251/rank47.log +294 -0
- 20250121165312/hf-593/added_tokens.json +24 -0
- 20250121165312/hf-593/config.json +30 -0
- 20250121165312/hf-593/merges.txt +0 -0
- 20250121165312/hf-593/model.safetensors.index.json +970 -0
- 20250121165312/hf-593/special_tokens_map.json +31 -0
- 20250121165312/hf-593/vocab.json +0 -0
- 20250121165312/rank0.log +0 -0
- 20250121165312/rank10.log +0 -0
- 20250121165312/rank13.log +0 -0
- 20250121165312/rank14.log +0 -0
- 20250121165312/rank18.log +0 -0
- 20250121165312/rank21.log +0 -0
20250120235238/rank1.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.19s
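The WARNING lines above come from the dataset preprocessing pass: any sample whose tokenized prompt exceeds max_length=32768 is dropped before packing. A minimal sketch of that filter, assuming a Hugging Face tokenizer and the "openai" message format named in the config (the function name and message handling are illustrative, not XTuner's actual code):

import json

def filter_overlong(jsonl_path, tokenizer, max_length=32768):
    # Keep samples whose tokenized conversation fits in max_length; count the rest.
    kept, discarded = [], 0
    with open(jsonl_path) as f:
        for line in f:
            sample = json.loads(line)
            text = "".join(m["content"] for m in sample["messages"])
            if len(tokenizer(text)["input_ids"]) > max_length:
                discarded += 1
                continue
            kept.append(sample)
    if discarded:
        print(f"{jsonl_path} has {discarded} prompt length>{max_length}, discard.")
    return kept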
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:07:54][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
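The DEBUG block above records XTuner rebinding each module's forward to an optimized kernel: FlashAttention for every self_attn and a fused RMSNorm for every layernorm, including the final model.norm. A minimal sketch of that dispatch pattern, assuming hypothetical replacement functions qwen2_attn_flash_forward and rms_norm_forward with the same signatures as the originals (not XTuner's actual implementation):

import types

def dispatch_forwards(model, qwen2_attn_flash_forward, rms_norm_forward):
    # Rebind forward on each module instance so existing call sites hit the fast path.
    for name, module in model.named_modules():
        cls = type(module).__name__
        if cls == "Qwen2FlashAttention2":
            module.forward = types.MethodType(qwen2_attn_flash_forward, module)
            print(f"Dispatch {name}({cls}) forward to `qwen2_attn_flash_forward`")
        elif cls == "Qwen2RMSNorm":
            module.forward = types.MethodType(rms_norm_forward, module)
            print(f"Dispatch {name}({cls}) forward to `rms_norm_forward`")

Patching instances rather than the class keeps the change scoped to this model object.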
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 149.58 seconds, peak gpu memory 13.4G
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
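With warmup_ratio=0.025 over 593 optimizer steps, warmup lasts int(0.025 * 593) = 14 steps, which matches the lr column below ramping linearly from 0.000001 at Step 1 to the peak 0.000020 (lr=2e-05) at Step 14. A minimal sketch of such a schedule; the linear ramp is confirmed by the logged values, while the cosine decay toward lr_min=6e-06 afterwards is an assumption:

import math

def lr_at(step, total_steps=593, lr=2e-5, lr_min=6e-6, warmup_ratio=0.025):
    warmup_steps = int(warmup_ratio * total_steps)  # 14 steps here
    if step < warmup_steps:
        return lr * (step + 1) / warmup_steps  # step 0 -> ~1.4e-06, logged as 0.000001
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return lr_min + 0.5 * (lr - lr_min) * (1 + math.cos(math.pi * progress))  # assumed decay shape

Here step 0 corresponds to the log's "Step 1/593"; for example lr_at(6) == 1e-05 matches the 0.000010 logged at Step 7.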
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 32.7GB text_tokens: 31548.0 tgs: 57 data_time: 2.14s time: 546.74s eta: 3 days, 18:03:37
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 32129.0 tgs: 61 data_time: 0.80s time: 523.23s eta: 3 days, 14:02:31
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.9GB text_tokens: 31947.0 tgs: 61 data_time: 0.86s time: 522.93s eta: 3 days, 13:50:53
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.0GB text_tokens: 31471.0 tgs: 60 data_time: 0.76s time: 520.27s eta: 3 days, 13:16:02
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.1GB text_tokens: 32503.0 tgs: 62 data_time: 0.78s time: 520.97s eta: 3 days, 13:14:13
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.218 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.7GB text_tokens: 31133.0 tgs: 59 data_time: 0.90s time: 520.90s eta: 3 days, 13:04:49
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.0GB text_tokens: 31847.0 tgs: 60 data_time: 1.21s time: 523.37s eta: 3 days, 13:20:16
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.362 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 32030.0 tgs: 61 data_time: 0.96s time: 520.65s eta: 3 days, 12:44:58
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.0GB text_tokens: 31786.0 tgs: 61 data_time: 0.75s time: 520.16s eta: 3 days, 12:31:35
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.9GB text_tokens: 32078.0 tgs: 61 data_time: 1.09s time: 520.48s eta: 3 days, 12:25:58
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 32240.0 tgs: 61 data_time: 0.89s time: 524.51s eta: 3 days, 12:56:31
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.9GB text_tokens: 31632.0 tgs: 60 data_time: 0.86s time: 520.63s eta: 3 days, 12:10:08
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 32.1GB text_tokens: 30691.0 tgs: 59 data_time: 0.79s time: 519.96s eta: 3 days, 11:54:57
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.1GB text_tokens: 32119.0 tgs: 61 data_time: 0.88s time: 521.27s eta: 3 days, 11:58:57
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.0GB text_tokens: 31154.0 tgs: 59 data_time: 0.67s time: 524.11s eta: 3 days, 12:17:39
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 32.9GB text_tokens: 31896.0 tgs: 61 data_time: 0.65s time: 520.52s eta: 3 days, 11:34:22
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 31357.0 tgs: 60 data_time: 0.93s time: 518.89s eta: 3 days, 11:09:59
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 30112.0 tgs: 57 data_time: 0.71s time: 522.09s eta: 3 days, 11:32:03
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.1GB text_tokens: 31775.0 tgs: 60 data_time: 0.88s time: 523.81s eta: 3 days, 11:39:52
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.1GB text_tokens: 32263.0 tgs: 61 data_time: 0.80s time: 520.47s eta: 3 days, 10:59:09
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.323 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 32.9GB text_tokens: 31621.0 tgs: 60 data_time: 1.09s time: 518.42s eta: 3 days, 10:30:55
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.400 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.1GB text_tokens: 32165.0 tgs: 61 data_time: 1.11s time: 522.82s eta: 3 days, 11:04:12
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.307 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 31731.0 tgs: 60 data_time: 0.60s time: 523.52s eta: 3 days, 11:02:09
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 31953.0 tgs: 61 data_time: 0.74s time: 521.01s eta: 3 days, 10:29:36
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.0GB text_tokens: 32071.0 tgs: 61 data_time: 0.84s time: 519.09s eta: 3 days, 10:02:39
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.1GB text_tokens: 32232.0 tgs: 61 data_time: 0.64s time: 523.28s eta: 3 days, 10:33:44
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.196 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 31872.0 tgs: 60 data_time: 0.70s time: 522.96s eta: 3 days, 10:21:58
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.0GB text_tokens: 31715.0 tgs: 60 data_time: 0.81s time: 521.30s eta: 3 days, 9:57:35
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.9GB text_tokens: 31142.0 tgs: 59 data_time: 0.69s time: 520.62s eta: 3 days, 9:42:30
|
| 314 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
|
| 315 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 32.7GB text_tokens: 31246.0 tgs: 59 data_time: 1.11s time: 522.42s eta: 3 days, 9:50:43
|
| 316 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
|
| 317 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.8GB text_tokens: 31427.0 tgs: 60 data_time: 0.89s time: 523.77s eta: 3 days, 9:54:43
|
| 318 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
|
| 319 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.319 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 31009.0 tgs: 59 data_time: 0.83s time: 520.41s eta: 3 days, 9:14:30
|
| 320 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
|
| 321 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.6GB text_tokens: 31125.0 tgs: 59 data_time: 0.70s time: 520.66s eta: 3 days, 9:08:10
|
| 322 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
|
| 323 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.1GB text_tokens: 32138.0 tgs: 61 data_time: 0.92s time: 521.32s eta: 3 days, 9:05:40
|
| 324 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
|
| 325 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.8GB text_tokens: 31588.0 tgs: 60 data_time: 0.95s time: 524.09s eta: 3 days, 9:22:48
|
| 326 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
|
| 327 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 32.8GB text_tokens: 31748.0 tgs: 60 data_time: 0.90s time: 520.94s eta: 3 days, 8:44:45
|
| 328 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
|
| 329 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.1GB text_tokens: 32016.0 tgs: 61 data_time: 0.93s time: 520.21s eta: 3 days, 8:29:19
|
| 330 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
|
| 331 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 32.9GB text_tokens: 31540.0 tgs: 60 data_time: 0.79s time: 522.55s eta: 3 days, 8:42:18
|
| 332 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
|
| 333 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.8GB text_tokens: 30666.0 tgs: 58 data_time: 0.84s time: 524.31s eta: 3 days, 8:49:54
|
| 334 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
|
| 335 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.8GB text_tokens: 31939.0 tgs: 61 data_time: 0.79s time: 519.87s eta: 3 days, 8:00:07
|
| 336 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
|
| 337 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 32.8GB text_tokens: 30286.0 tgs: 58 data_time: 0.84s time: 520.12s eta: 3 days, 7:53:45
|
| 338 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
|
| 339 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.1GB text_tokens: 32181.0 tgs: 61 data_time: 0.92s time: 522.00s eta: 3 days, 8:02:23
|
| 340 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 32188.0 tgs: 61 data_time: 0.75s time: 524.34s eta: 3 days, 8:15:09
|
| 342 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.370 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 32373.0 tgs: 62 data_time: 0.71s time: 520.56s eta: 3 days, 7:31:46
|
| 344 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.7GB text_tokens: 30468.0 tgs: 58 data_time: 0.67s time: 518.55s eta: 3 days, 7:04:42
|
| 346 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.211 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 31398.0 tgs: 60 data_time: 0.77s time: 523.14s eta: 3 days, 7:37:59
|
| 348 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 32.9GB text_tokens: 30639.0 tgs: 58 data_time: 0.59s time: 523.61s eta: 3 days, 7:33:33
|
| 350 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 33.1GB text_tokens: 32054.0 tgs: 61 data_time: 0.94s time: 520.20s eta: 3 days, 6:53:50
|
| 352 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 32.8GB text_tokens: 31329.0 tgs: 60 data_time: 0.91s time: 520.04s eta: 3 days, 6:43:41
|
| 354 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.232 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 30789.0 tgs: 58 data_time: 0.79s time: 523.74s eta: 3 days, 7:08:37
|
| 356 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
|
| 357 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.310 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.0GB text_tokens: 31630.0 tgs: 60 data_time: 0.80s time: 523.82s eta: 3 days, 7:00:35
|
| 358 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
|
| 359 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.221 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 31039.0 tgs: 59 data_time: 0.80s time: 520.22s eta: 3 days, 6:19:20
|
| 360 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
|
| 361 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 31956.0 tgs: 61 data_time: 1.05s time: 518.76s eta: 3 days, 5:57:29
|
| 362 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
|
| 363 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.1GB text_tokens: 32128.0 tgs: 61 data_time: 0.72s time: 522.67s eta: 3 days, 6:24:02
|
| 364 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.314 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.0GB text_tokens: 31665.0 tgs: 60 data_time: 0.69s time: 522.76s eta: 3 days, 6:16:10
|
| 366 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.8GB text_tokens: 31563.0 tgs: 60 data_time: 0.74s time: 520.51s eta: 3 days, 5:47:14
|
| 368 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:26:33][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 32095.0 tgs: 61 data_time: 0.70s time: 519.64s eta: 3 days, 5:30:48
|
| 370 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.325 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 32082.0 tgs: 61 data_time: 1.08s time: 521.33s eta: 3 days, 5:37:15
|
| 372 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 33.1GB text_tokens: 32250.0 tgs: 61 data_time: 0.70s time: 523.57s eta: 3 days, 5:48:32
|
| 374 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.325 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.1GB text_tokens: 31468.0 tgs: 60 data_time: 0.82s time: 519.29s eta: 3 days, 5:01:41
|
| 376 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.0GB text_tokens: 31281.0 tgs: 60 data_time: 0.84s time: 520.67s eta: 3 days, 5:05:15
|
| 378 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.1GB text_tokens: 31987.0 tgs: 61 data_time: 0.54s time: 521.47s eta: 3 days, 5:03:40
|
| 380 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.3GB text_tokens: 30618.0 tgs: 58 data_time: 0.61s time: 523.93s eta: 3 days, 5:16:46
|
| 382 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 32.4GB text_tokens: 30691.0 tgs: 59 data_time: 0.67s time: 518.83s eta: 3 days, 4:23:02
|
| 384 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 32.9GB text_tokens: 30766.0 tgs: 59 data_time: 0.64s time: 519.27s eta: 3 days, 4:18:14
|
| 386 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.0GB text_tokens: 30736.0 tgs: 59 data_time: 0.78s time: 520.70s eta: 3 days, 4:22:08
|
| 388 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.1GB text_tokens: 32451.0 tgs: 61 data_time: 0.74s time: 524.24s eta: 3 days, 4:44:33
|
| 390 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.0GB text_tokens: 30934.0 tgs: 59 data_time: 0.93s time: 520.34s eta: 3 days, 4:01:38
|
| 392 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.174 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 32071.0 tgs: 61 data_time: 0.77s time: 519.52s eta: 3 days, 3:45:47
|
| 394 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 1][DP 0][SP 1][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.0GB text_tokens: 31757.0 tgs: 60 data_time: 0.94s time: 522.30s eta: 3 days, 4:01:25
|
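The repeated `grad_norm: nan` / `if_nan_skip` entries above reflect a common safeguard: compute the global gradient norm during clipping and skip the optimizer update whenever it is non-finite, keeping a running count. Below is a minimal sketch of that pattern for a standard PyTorch training step; the `nan_skip_count` variable and the print call merely mirror the log fields and are illustrative, not XTuner's actual implementation.

    import math
    import torch

    nan_skip_count = 0  # mirrors the "if_nan_skip" counter in the log

    def clip_and_step(model, optimizer, step, max_grad_norm=1.0):
        """Clip gradients; skip the update when the global norm is NaN/Inf."""
        global nan_skip_count
        # clip_grad_norm_ returns the total norm; it is NaN/Inf if any grad is.
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        if not math.isfinite(grad_norm.item()):
            nan_skip_count += 1
            print(f"[Step {step}] The grad norm is NaN or Inf, skip this step. "
                  f"Skipped {nan_skip_count} steps in total.")
        else:
            optimizer.step()
        optimizer.zero_grad()
        return grad_norm

The `max_grad_norm=1.0` default matches the `max_grad_norm=1` argument recorded in the Namespace below.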
20250120235238/rank11.log
ADDED
@@ -0,0 +1,395 @@
+[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
+[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
+[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
+[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
+[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
+[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
+[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
+[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.13s
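The eight warnings above show the loader dropping every sample whose tokenized prompt exceeds the `max_length=32768` recorded in the Namespace. A hedged sketch of that filter, assuming .jsonl files of OpenAI-format messages; `count_prompt_tokens` is a hypothetical helper standing in for the real tokenization, not XTuner's loader.

    import json

    MAX_LENGTH = 32768  # matches max_length in the Namespace above

    def load_filtered(path, count_prompt_tokens):
        """Yield samples from a .jsonl file, discarding over-long prompts."""
        kept, discarded = [], 0
        with open(path) as f:
            for line in f:
                sample = json.loads(line)
                if count_prompt_tokens(sample) > MAX_LENGTH:
                    discarded += 1  # mirrors "has N prompt length>32768, discard"
                    continue
                kept.append(sample)
        if discarded:
            print(f"{path} has {discarded} prompt length>{MAX_LENGTH}, discard.")
        return kept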
| 12 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 77 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 78 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 79 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 80 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 81 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 82 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 83 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 84 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 85 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 86 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.88 seconds, peak gpu memory 13.4G
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.327 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 31696.0 tgs: 57 data_time: 1.88s time: 548.03s eta: 3 days, 18:16:21
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 31732.0 tgs: 60 data_time: 1.03s time: 523.24s eta: 3 days, 14:02:39
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 32529.0 tgs: 62 data_time: 0.89s time: 522.88s eta: 3 days, 13:50:21
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 32.9GB text_tokens: 31543.0 tgs: 60 data_time: 0.85s time: 520.29s eta: 3 days, 13:16:11
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.0GB text_tokens: 31681.0 tgs: 60 data_time: 0.94s time: 520.98s eta: 3 days, 13:14:18
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.4GB text_tokens: 30335.0 tgs: 58 data_time: 0.82s time: 520.90s eta: 3 days, 13:04:46
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.221 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 32.8GB text_tokens: 30779.0 tgs: 58 data_time: 0.68s time: 523.38s eta: 3 days, 13:20:26
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.295 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 31645.0 tgs: 60 data_time: 0.91s time: 520.66s eta: 3 days, 12:45:07
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.223 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 32125.0 tgs: 61 data_time: 0.66s time: 520.18s eta: 3 days, 12:31:44
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.7GB text_tokens: 31629.0 tgs: 60 data_time: 0.91s time: 520.41s eta: 3 days, 12:25:22
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.347 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 32.9GB text_tokens: 31480.0 tgs: 60 data_time: 0.93s time: 524.53s eta: 3 days, 12:56:42
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.0GB text_tokens: 32172.0 tgs: 61 data_time: 0.70s time: 520.65s eta: 3 days, 12:10:17
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 32.7GB text_tokens: 31315.0 tgs: 60 data_time: 1.22s time: 519.93s eta: 3 days, 11:54:40
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.313 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.1GB text_tokens: 32182.0 tgs: 61 data_time: 0.63s time: 521.29s eta: 3 days, 11:59:07
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 31862.0 tgs: 60 data_time: 0.74s time: 524.13s eta: 3 days, 12:17:48
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.0GB text_tokens: 31743.0 tgs: 60 data_time: 0.88s time: 520.54s eta: 3 days, 11:34:31
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 32304.0 tgs: 62 data_time: 0.84s time: 518.83s eta: 3 days, 11:09:22
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.311 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31895.0 tgs: 61 data_time: 0.89s time: 522.11s eta: 3 days, 11:32:12
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.6GB text_tokens: 30695.0 tgs: 58 data_time: 0.88s time: 523.83s eta: 3 days, 11:40:01
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 32.2GB text_tokens: 30721.0 tgs: 59 data_time: 0.97s time: 520.46s eta: 3 days, 10:59:04
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 29162.0 tgs: 56 data_time: 0.58s time: 518.43s eta: 3 days, 10:31:01
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.1GB text_tokens: 31808.0 tgs: 60 data_time: 0.81s time: 522.84s eta: 3 days, 11:04:24
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 32435.0 tgs: 61 data_time: 1.02s time: 523.54s eta: 3 days, 11:02:20
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 32461.0 tgs: 62 data_time: 0.79s time: 520.96s eta: 3 days, 10:29:09
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.8GB text_tokens: 31514.0 tgs: 60 data_time: 0.99s time: 519.10s eta: 3 days, 10:02:48
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.0GB text_tokens: 30176.0 tgs: 57 data_time: 0.56s time: 523.29s eta: 3 days, 10:33:50
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 32145.0 tgs: 61 data_time: 1.00s time: 522.93s eta: 3 days, 10:21:43
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 32029.0 tgs: 61 data_time: 0.58s time: 521.32s eta: 3 days, 9:57:44
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.9GB text_tokens: 31383.0 tgs: 60 data_time: 0.64s time: 520.64s eta: 3 days, 9:42:39
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 32.4GB text_tokens: 30895.0 tgs: 59 data_time: 0.77s time: 522.41s eta: 3 days, 9:50:39
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.327 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.0GB text_tokens: 31598.0 tgs: 60 data_time: 1.06s time: 523.75s eta: 3 days, 9:54:32
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 32406.0 tgs: 62 data_time: 0.78s time: 520.43s eta: 3 days, 9:14:39
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 32322.0 tgs: 62 data_time: 0.83s time: 520.68s eta: 3 days, 9:08:19
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.1GB text_tokens: 30969.0 tgs: 59 data_time: 0.85s time: 521.30s eta: 3 days, 9:05:29
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.8GB text_tokens: 31334.0 tgs: 59 data_time: 0.89s time: 524.11s eta: 3 days, 9:22:57
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 31952.0 tgs: 61 data_time: 0.83s time: 520.96s eta: 3 days, 8:44:55
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.212 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.0GB text_tokens: 32127.0 tgs: 61 data_time: 0.90s time: 520.15s eta: 3 days, 8:28:43
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 32.9GB text_tokens: 31858.0 tgs: 60 data_time: 0.82s time: 522.56s eta: 3 days, 8:42:25
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.8GB text_tokens: 31386.0 tgs: 59 data_time: 0.84s time: 524.34s eta: 3 days, 8:50:07
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.355 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.1GB text_tokens: 32250.0 tgs: 62 data_time: 0.70s time: 519.88s eta: 3 days, 8:00:12
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.308 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 32507.0 tgs: 62 data_time: 1.06s time: 520.11s eta: 3 days, 7:53:41
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 32.9GB text_tokens: 31757.0 tgs: 60 data_time: 0.67s time: 522.00s eta: 3 days, 8:02:22
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 32.8GB text_tokens: 31888.0 tgs: 60 data_time: 0.97s time: 524.35s eta: 3 days, 8:15:17
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 31891.0 tgs: 61 data_time: 0.80s time: 520.51s eta: 3 days, 7:31:18
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.9GB text_tokens: 31410.0 tgs: 60 data_time: 0.69s time: 518.56s eta: 3 days, 7:04:51
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.0GB text_tokens: 31763.0 tgs: 60 data_time: 0.97s time: 523.15s eta: 3 days, 7:38:08
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 32.6GB text_tokens: 30226.0 tgs: 57 data_time: 0.89s time: 523.62s eta: 3 days, 7:33:41
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.9GB text_tokens: 31914.0 tgs: 61 data_time: 0.86s time: 520.17s eta: 3 days, 6:53:35
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.0GB text_tokens: 31209.0 tgs: 60 data_time: 0.49s time: 520.06s eta: 3 days, 6:43:49
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 32.9GB text_tokens: 31712.0 tgs: 60 data_time: 0.75s time: 523.76s eta: 3 days, 7:08:44
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.313 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.0GB text_tokens: 31541.0 tgs: 60 data_time: 0.79s time: 523.78s eta: 3 days, 7:00:13
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 32580.0 tgs: 62 data_time: 0.79s time: 520.24s eta: 3 days, 6:19:29
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.312 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.7GB text_tokens: 31793.0 tgs: 61 data_time: 0.57s time: 518.78s eta: 3 days, 5:57:37
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 32.8GB text_tokens: 31726.0 tgs: 60 data_time: 0.64s time: 522.68s eta: 3 days, 6:24:09
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.380 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.0GB text_tokens: 32291.0 tgs: 61 data_time: 0.85s time: 522.73s eta: 3 days, 6:15:51
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.0GB text_tokens: 31197.0 tgs: 59 data_time: 0.78s time: 520.53s eta: 3 days, 5:47:23
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.298 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 32.7GB text_tokens: 31650.0 tgs: 60 data_time: 0.90s time: 519.66s eta: 3 days, 5:30:56
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.0GB text_tokens: 31693.0 tgs: 60 data_time: 0.78s time: 521.30s eta: 3 days, 5:36:54
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 33.0GB text_tokens: 30935.0 tgs: 59 data_time: 0.82s time: 523.59s eta: 3 days, 5:48:40
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 31373.0 tgs: 60 data_time: 0.75s time: 519.32s eta: 3 days, 5:01:58
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 31466.0 tgs: 60 data_time: 0.72s time: 520.65s eta: 3 days, 5:05:08
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.214 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 32.9GB text_tokens: 31582.0 tgs: 60 data_time: 0.75s time: 521.45s eta: 3 days, 5:03:31
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 31379.0 tgs: 59 data_time: 0.79s time: 523.95s eta: 3 days, 5:16:55
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.0GB text_tokens: 31834.0 tgs: 61 data_time: 0.50s time: 518.85s eta: 3 days, 4:23:10
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.306 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.0GB text_tokens: 31863.0 tgs: 61 data_time: 0.64s time: 519.24s eta: 3 days, 4:17:58
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 32.9GB text_tokens: 31055.0 tgs: 59 data_time: 0.65s time: 520.71s eta: 3 days, 4:22:17
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.296 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.0GB text_tokens: 31953.0 tgs: 60 data_time: 0.75s time: 524.25s eta: 3 days, 4:44:41
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 32157.0 tgs: 61 data_time: 0.60s time: 520.33s eta: 3 days, 4:01:33
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.236 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 32.7GB text_tokens: 31301.0 tgs: 60 data_time: 0.73s time: 519.50s eta: 3 days, 3:45:35
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 11][DP 2][SP 3][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.0GB text_tokens: 31423.0 tgs: 60 data_time: 0.46s time: 522.32s eta: 3 days, 4:01:34
|
20250120235238/rank13.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.14s
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.91 seconds, peak gpu memory 13.4G
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 32.3GB text_tokens: 30603.0 tgs: 55 data_time: 1.93s time: 547.92s eta: 3 days, 18:15:15
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 32.7GB text_tokens: 31849.0 tgs: 60 data_time: 0.70s time: 523.24s eta: 3 days, 14:02:38
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.5GB text_tokens: 31059.0 tgs: 59 data_time: 0.91s time: 522.88s eta: 3 days, 13:50:21
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.1GB text_tokens: 32411.0 tgs: 62 data_time: 0.83s time: 520.29s eta: 3 days, 13:16:11
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.1GB text_tokens: 31204.0 tgs: 59 data_time: 0.65s time: 520.98s eta: 3 days, 13:14:19
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.306 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.9GB text_tokens: 31449.0 tgs: 60 data_time: 0.83s time: 520.90s eta: 3 days, 13:04:46
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 32.9GB text_tokens: 31590.0 tgs: 60 data_time: 0.80s time: 523.38s eta: 3 days, 13:20:26
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 32.9GB text_tokens: 31967.0 tgs: 61 data_time: 0.77s time: 520.66s eta: 3 days, 12:45:08
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.0GB text_tokens: 31084.0 tgs: 59 data_time: 0.90s time: 520.18s eta: 3 days, 12:31:44
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.7GB text_tokens: 31694.0 tgs: 60 data_time: 0.85s time: 520.42s eta: 3 days, 12:25:22
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 32132.0 tgs: 61 data_time: 0.93s time: 524.53s eta: 3 days, 12:56:42
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.1GB text_tokens: 31935.0 tgs: 61 data_time: 0.76s time: 520.65s eta: 3 days, 12:10:17
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.1GB text_tokens: 31576.0 tgs: 60 data_time: 0.81s time: 519.93s eta: 3 days, 11:54:39
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.327 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.0GB text_tokens: 31631.0 tgs: 60 data_time: 0.78s time: 521.29s eta: 3 days, 11:59:07
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 31904.0 tgs: 60 data_time: 0.82s time: 524.12s eta: 3 days, 12:17:47
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 32061.0 tgs: 61 data_time: 0.81s time: 520.54s eta: 3 days, 11:34:31
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 32.6GB text_tokens: 30810.0 tgs: 59 data_time: 0.70s time: 518.83s eta: 3 days, 11:09:22
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31204.0 tgs: 59 data_time: 0.89s time: 522.11s eta: 3 days, 11:32:12
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.336 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.6GB text_tokens: 31557.0 tgs: 60 data_time: 1.16s time: 523.83s eta: 3 days, 11:40:02
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.323 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 32.9GB text_tokens: 31964.0 tgs: 61 data_time: 0.71s time: 520.46s eta: 3 days, 10:59:03
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.1GB text_tokens: 31859.0 tgs: 61 data_time: 0.67s time: 518.43s eta: 3 days, 10:31:02
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.1GB text_tokens: 32340.0 tgs: 61 data_time: 0.87s time: 522.84s eta: 3 days, 11:04:24
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.224 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 32.7GB text_tokens: 31453.0 tgs: 60 data_time: 0.47s time: 523.54s eta: 3 days, 11:02:20
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 31779.0 tgs: 61 data_time: 0.60s time: 520.96s eta: 3 days, 10:29:09
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.4GB text_tokens: 31169.0 tgs: 60 data_time: 0.79s time: 519.10s eta: 3 days, 10:02:48
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.1GB text_tokens: 29708.0 tgs: 56 data_time: 0.90s time: 523.29s eta: 3 days, 10:33:50
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 32.5GB text_tokens: 31114.0 tgs: 59 data_time: 1.01s time: 522.93s eta: 3 days, 10:21:43
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 32372.0 tgs: 62 data_time: 0.94s time: 521.32s eta: 3 days, 9:57:45
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.0GB text_tokens: 31554.0 tgs: 60 data_time: 0.94s time: 520.64s eta: 3 days, 9:42:39
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 32.5GB text_tokens: 31042.0 tgs: 59 data_time: 1.05s time: 522.41s eta: 3 days, 9:50:39
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.335 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.0GB text_tokens: 31728.0 tgs: 60 data_time: 0.81s time: 523.75s eta: 3 days, 9:54:32
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.349 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 32.7GB text_tokens: 31816.0 tgs: 61 data_time: 0.73s time: 520.43s eta: 3 days, 9:14:39
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 30775.0 tgs: 59 data_time: 0.64s time: 520.68s eta: 3 days, 9:08:20
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.1GB text_tokens: 31569.0 tgs: 60 data_time: 0.48s time: 521.30s eta: 3 days, 9:05:29
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.7GB text_tokens: 31161.0 tgs: 59 data_time: 0.68s time: 524.11s eta: 3 days, 9:22:57
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.1GB text_tokens: 31655.0 tgs: 60 data_time: 0.97s time: 520.96s eta: 3 days, 8:44:54
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.0GB text_tokens: 31369.0 tgs: 60 data_time: 0.79s time: 520.15s eta: 3 days, 8:28:44
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.1GB text_tokens: 32447.0 tgs: 62 data_time: 0.83s time: 522.56s eta: 3 days, 8:42:24
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 31783.0 tgs: 60 data_time: 0.91s time: 524.33s eta: 3 days, 8:50:04
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.7GB text_tokens: 30915.0 tgs: 59 data_time: 0.73s time: 519.88s eta: 3 days, 8:00:16
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 32.8GB text_tokens: 30941.0 tgs: 59 data_time: 0.75s time: 520.11s eta: 3 days, 7:53:42
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.0GB text_tokens: 31815.0 tgs: 60 data_time: 0.54s time: 522.00s eta: 3 days, 8:02:22
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 32295.0 tgs: 61 data_time: 0.66s time: 524.35s eta: 3 days, 8:15:17
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.313 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 32.9GB text_tokens: 32051.0 tgs: 61 data_time: 0.62s time: 520.51s eta: 3 days, 7:31:18
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.340 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.1GB text_tokens: 32370.0 tgs: 62 data_time: 0.69s time: 518.56s eta: 3 days, 7:04:52
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.0GB text_tokens: 31953.0 tgs: 61 data_time: 0.53s time: 523.15s eta: 3 days, 7:38:08
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.1GB text_tokens: 31767.0 tgs: 60 data_time: 0.67s time: 523.62s eta: 3 days, 7:33:41
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 33.1GB text_tokens: 31980.0 tgs: 61 data_time: 0.60s time: 520.17s eta: 3 days, 6:53:35
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 31917.0 tgs: 61 data_time: 0.85s time: 520.05s eta: 3 days, 6:43:49
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.0GB text_tokens: 31824.0 tgs: 60 data_time: 0.67s time: 523.76s eta: 3 days, 7:08:44
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.214 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 32.6GB text_tokens: 30092.0 tgs: 57 data_time: 0.76s time: 523.78s eta: 3 days, 7:00:13
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.0GB text_tokens: 31783.0 tgs: 61 data_time: 0.71s time: 520.24s eta: 3 days, 6:19:30
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.9GB text_tokens: 30978.0 tgs: 59 data_time: 0.72s time: 518.78s eta: 3 days, 5:57:37
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.0GB text_tokens: 31772.0 tgs: 60 data_time: 0.78s time: 522.69s eta: 3 days, 6:24:09
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 31322.0 tgs: 59 data_time: 1.04s time: 522.73s eta: 3 days, 6:15:52
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.224 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.6GB text_tokens: 31440.0 tgs: 60 data_time: 0.72s time: 520.53s eta: 3 days, 5:47:23
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 31790.0 tgs: 61 data_time: 0.84s time: 519.66s eta: 3 days, 5:30:55
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 32257.0 tgs: 61 data_time: 0.76s time: 521.29s eta: 3 days, 5:36:54
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.7GB text_tokens: 31778.0 tgs: 60 data_time: 0.71s time: 523.59s eta: 3 days, 5:48:41
|
| 374 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.0GB text_tokens: 31886.0 tgs: 61 data_time: 0.98s time: 519.32s eta: 3 days, 5:01:57
|
| 376 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 32.9GB text_tokens: 31703.0 tgs: 60 data_time: 0.82s time: 520.65s eta: 3 days, 5:05:07
|
| 378 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.1GB text_tokens: 31729.0 tgs: 60 data_time: 0.96s time: 521.45s eta: 3 days, 5:03:32
|
| 380 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.1GB text_tokens: 32208.0 tgs: 61 data_time: 0.79s time: 523.94s eta: 3 days, 5:16:54
|
| 382 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.0GB text_tokens: 31951.0 tgs: 61 data_time: 0.92s time: 518.85s eta: 3 days, 4:23:11
|
| 384 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 32.9GB text_tokens: 31537.0 tgs: 60 data_time: 0.56s time: 519.24s eta: 3 days, 4:17:58
|
| 386 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.0GB text_tokens: 31597.0 tgs: 60 data_time: 0.75s time: 520.71s eta: 3 days, 4:22:17
|
| 388 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 32.8GB text_tokens: 31472.0 tgs: 60 data_time: 0.79s time: 524.25s eta: 3 days, 4:44:42
|
| 390 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 31847.0 tgs: 61 data_time: 0.89s time: 520.33s eta: 3 days, 4:01:32
|
| 392 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 32423.0 tgs: 62 data_time: 0.91s time: 519.50s eta: 3 days, 3:45:35
|
| 394 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 13][DP 3][SP 1][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.295 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.0GB text_tokens: 32055.0 tgs: 61 data_time: 0.96s time: 522.32s eta: 3 days, 4:01:33
|
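Every step in the rank13 excerpt above is skipped because the all-reduced gradient norm comes back NaN, so `if_nan_skip` grows in lockstep with the step counter while the optimizer state is never touched. A minimal sketch of the kind of guard these warnings suggest (the helper name is hypothetical, not XTuner's actual API):

```python
import math

import torch

def step_with_nan_guard(model, optimizer, max_grad_norm=1.0):
    """Clip gradients, but skip the optimizer update when the total norm is NaN/Inf."""
    grad_norm = torch.nn.utils.clip_grad_norm_(
        model.parameters(), max_grad_norm, error_if_nonfinite=False)
    if not math.isfinite(float(grad_norm)):
        optimizer.zero_grad()          # drop the poisoned gradients entirely
        return float(grad_norm), True  # caller logs the warning and bumps the skip counter
    optimizer.step()
    optimizer.zero_grad()
    return float(grad_norm), False
```

Because the skipped step still advances the scheduler and step counter, `lr` keeps following its schedule in the log even though no parameters change.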
20250120235238/rank15.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.22s
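The per-file warnings above record a length filter: any sample whose tokenized prompt exceeds `max_length=32768` is discarded before packing. A rough sketch of such a filter (assumed shape only; the helper below is hypothetical, not XTuner's real loader):

```python
import json

def filter_jsonl(path, tokenizer, max_length=32768):
    """Keep samples whose tokenized prompt fits in max_length; count the discards."""
    kept, dropped = [], 0
    with open(path) as f:
        for line in f:
            sample = json.loads(line)
            ids = tokenizer.apply_chat_template(sample["messages"])  # token ids
            if len(ids) > max_length:
                dropped += 1
                continue
            kept.append(sample)
    if dropped:
        print(f"{path} has {dropped} prompt length>{max_length}, discard.")
    return kept
```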
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
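The DEBUG block above records XTuner rebinding each submodule's `forward` to a dispatched implementation: flash attention for `Qwen2FlashAttention2` and a fused RMSNorm kernel for `Qwen2RMSNorm`. A minimal sketch of that forward-patching pattern (the function names and signature here are illustrative assumptions, not XTuner's actual API):

```python
from types import MethodType

def dispatch_forwards(model, logger, rms_norm_forward, attn_flash_forward):
    """Rebind matching submodule forwards to optimized kernels, logging each dispatch."""
    for name, mod in model.named_modules():
        cls = type(mod).__name__
        if cls == "Qwen2RMSNorm":
            mod.forward = MethodType(rms_norm_forward, mod)
            logger.debug(f"Dispatch {name}({cls}) forward to `rms_norm_forward`")
        elif cls == "Qwen2FlashAttention2":
            mod.forward = MethodType(attn_flash_forward, mod)
            logger.debug(f"Dispatch {name}({cls}) forward to `qwen2_attn_flash_forward`")
```

Patching at the instance level like this leaves the checkpoint format untouched, which is consistent with the run later saving a standard `hf-593` HuggingFace checkpoint.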
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.91 seconds, peak gpu memory 13.4G
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 32.3GB text_tokens: 30603.0 tgs: 55 data_time: 1.99s time: 547.92s eta: 3 days, 18:15:15
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 32.7GB text_tokens: 31849.0 tgs: 60 data_time: 0.71s time: 523.25s eta: 3 days, 14:02:41
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.5GB text_tokens: 31059.0 tgs: 59 data_time: 0.94s time: 522.88s eta: 3 days, 13:50:21
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.1GB text_tokens: 32411.0 tgs: 62 data_time: 0.86s time: 520.29s eta: 3 days, 13:16:11
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.1GB text_tokens: 31204.0 tgs: 59 data_time: 0.69s time: 520.98s eta: 3 days, 13:14:19
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.9GB text_tokens: 31449.0 tgs: 60 data_time: 0.86s time: 520.90s eta: 3 days, 13:04:46
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 32.9GB text_tokens: 31590.0 tgs: 60 data_time: 0.83s time: 523.38s eta: 3 days, 13:20:26
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.311 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 32.9GB text_tokens: 31967.0 tgs: 61 data_time: 0.79s time: 520.66s eta: 3 days, 12:45:08
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.394 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.0GB text_tokens: 31084.0 tgs: 59 data_time: 0.92s time: 520.18s eta: 3 days, 12:31:43
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.215 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.7GB text_tokens: 31694.0 tgs: 60 data_time: 0.87s time: 520.41s eta: 3 days, 12:25:22
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.232 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 32132.0 tgs: 61 data_time: 0.94s time: 524.53s eta: 3 days, 12:56:42
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.1GB text_tokens: 31935.0 tgs: 61 data_time: 0.80s time: 520.65s eta: 3 days, 12:10:18
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.1GB text_tokens: 31576.0 tgs: 60 data_time: 0.83s time: 519.93s eta: 3 days, 11:54:36
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.0GB text_tokens: 31631.0 tgs: 60 data_time: 0.82s time: 521.29s eta: 3 days, 11:59:06
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 31904.0 tgs: 60 data_time: 0.85s time: 524.13s eta: 3 days, 12:17:49
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 32061.0 tgs: 61 data_time: 0.84s time: 520.54s eta: 3 days, 11:34:31
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 32.6GB text_tokens: 30810.0 tgs: 59 data_time: 0.73s time: 518.83s eta: 3 days, 11:09:22
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31204.0 tgs: 59 data_time: 0.96s time: 522.11s eta: 3 days, 11:32:13
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.6GB text_tokens: 31557.0 tgs: 60 data_time: 1.18s time: 523.83s eta: 3 days, 11:40:01
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 32.9GB text_tokens: 31964.0 tgs: 61 data_time: 0.73s time: 520.46s eta: 3 days, 10:59:04
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.1GB text_tokens: 31859.0 tgs: 61 data_time: 0.69s time: 518.43s eta: 3 days, 10:31:01
|
| 298 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
|
| 299 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.1GB text_tokens: 32340.0 tgs: 61 data_time: 0.89s time: 522.84s eta: 3 days, 11:04:24
|
| 300 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
|
| 301 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 32.7GB text_tokens: 31453.0 tgs: 60 data_time: 0.48s time: 523.54s eta: 3 days, 11:02:20
|
| 302 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
|
| 303 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 31779.0 tgs: 61 data_time: 0.61s time: 520.96s eta: 3 days, 10:29:09
|
| 304 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
|
| 305 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.4GB text_tokens: 31169.0 tgs: 60 data_time: 0.82s time: 519.10s eta: 3 days, 10:02:48
|
| 306 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
|
| 307 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.236 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.1GB text_tokens: 29708.0 tgs: 56 data_time: 0.96s time: 523.29s eta: 3 days, 10:33:50
|
| 308 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
|
| 309 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.324 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 32.5GB text_tokens: 31114.0 tgs: 59 data_time: 1.03s time: 522.93s eta: 3 days, 10:21:43
|
| 310 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
|
| 311 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.317 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 32372.0 tgs: 62 data_time: 0.94s time: 521.32s eta: 3 days, 9:57:45
|
| 312 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
|
| 313 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.0GB text_tokens: 31554.0 tgs: 60 data_time: 0.97s time: 520.64s eta: 3 days, 9:42:39
|
| 314 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
|
| 315 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.295 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 32.5GB text_tokens: 31042.0 tgs: 59 data_time: 1.07s time: 522.41s eta: 3 days, 9:50:39
|
| 316 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
|
| 317 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.0GB text_tokens: 31728.0 tgs: 60 data_time: 0.84s time: 523.75s eta: 3 days, 9:54:32
|
| 318 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
|
| 319 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.231 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 32.7GB text_tokens: 31816.0 tgs: 61 data_time: 0.73s time: 520.43s eta: 3 days, 9:14:39
|
| 320 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
|
| 321 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.306 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 30775.0 tgs: 59 data_time: 0.66s time: 520.68s eta: 3 days, 9:08:19
|
| 322 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
|
| 323 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.1GB text_tokens: 31569.0 tgs: 60 data_time: 0.50s time: 521.30s eta: 3 days, 9:05:28
|
| 324 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
|
| 325 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.7GB text_tokens: 31161.0 tgs: 59 data_time: 0.70s time: 524.11s eta: 3 days, 9:22:57
|
| 326 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
|
| 327 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.1GB text_tokens: 31655.0 tgs: 60 data_time: 0.99s time: 520.96s eta: 3 days, 8:44:54
|
| 328 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
|
| 329 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.0GB text_tokens: 31369.0 tgs: 60 data_time: 0.82s time: 520.15s eta: 3 days, 8:28:43
|
| 330 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
|
| 331 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.208 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.1GB text_tokens: 32447.0 tgs: 62 data_time: 0.84s time: 522.56s eta: 3 days, 8:42:24
|
| 332 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
|
| 333 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 31783.0 tgs: 60 data_time: 0.93s time: 524.33s eta: 3 days, 8:50:04
|
| 334 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
|
| 335 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.7GB text_tokens: 30915.0 tgs: 59 data_time: 0.76s time: 519.89s eta: 3 days, 8:00:16
|
| 336 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
|
| 337 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 32.8GB text_tokens: 30941.0 tgs: 59 data_time: 0.75s time: 520.11s eta: 3 days, 7:53:42
|
| 338 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
|
| 339 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.321 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.0GB text_tokens: 31815.0 tgs: 60 data_time: 0.58s time: 522.00s eta: 3 days, 8:02:22
|
| 340 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 32295.0 tgs: 61 data_time: 0.68s time: 524.35s eta: 3 days, 8:15:17
|
| 342 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.214 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 32.9GB text_tokens: 32051.0 tgs: 61 data_time: 0.63s time: 520.50s eta: 3 days, 7:31:17
|
| 344 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.1GB text_tokens: 32370.0 tgs: 62 data_time: 0.72s time: 518.56s eta: 3 days, 7:04:52
|
| 346 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.0GB text_tokens: 31953.0 tgs: 61 data_time: 0.55s time: 523.15s eta: 3 days, 7:38:08
|
| 348 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.313 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.1GB text_tokens: 31767.0 tgs: 60 data_time: 0.68s time: 523.62s eta: 3 days, 7:33:41
|
| 350 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 33.1GB text_tokens: 31980.0 tgs: 61 data_time: 0.62s time: 520.18s eta: 3 days, 6:53:35
|
| 352 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 31917.0 tgs: 61 data_time: 0.86s time: 520.05s eta: 3 days, 6:43:49
|
| 354 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.0GB text_tokens: 31824.0 tgs: 60 data_time: 0.65s time: 523.76s eta: 3 days, 7:08:44
|
| 356 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
|
| 357 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 32.6GB text_tokens: 30092.0 tgs: 57 data_time: 0.77s time: 523.78s eta: 3 days, 7:00:12
|
| 358 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
|
| 359 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.334 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.0GB text_tokens: 31783.0 tgs: 61 data_time: 0.73s time: 520.24s eta: 3 days, 6:19:30
|
| 360 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
|
| 361 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.9GB text_tokens: 30978.0 tgs: 59 data_time: 0.73s time: 518.78s eta: 3 days, 5:57:37
|
| 362 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
|
| 363 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.0GB text_tokens: 31772.0 tgs: 60 data_time: 0.79s time: 522.69s eta: 3 days, 6:24:09
|
| 364 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.324 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 31322.0 tgs: 59 data_time: 1.04s time: 522.73s eta: 3 days, 6:15:51
|
| 366 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.333 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.6GB text_tokens: 31440.0 tgs: 60 data_time: 0.73s time: 520.53s eta: 3 days, 5:47:22
|
| 368 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 31790.0 tgs: 61 data_time: 0.86s time: 519.66s eta: 3 days, 5:30:56
|
| 370 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 32257.0 tgs: 61 data_time: 0.77s time: 521.30s eta: 3 days, 5:36:54
|
| 372 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.7GB text_tokens: 31778.0 tgs: 60 data_time: 0.73s time: 523.59s eta: 3 days, 5:48:40
|
| 374 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.316 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.0GB text_tokens: 31886.0 tgs: 61 data_time: 1.01s time: 519.32s eta: 3 days, 5:01:58
|
| 376 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 32.9GB text_tokens: 31703.0 tgs: 60 data_time: 0.84s time: 520.65s eta: 3 days, 5:05:07
|
| 378 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.1GB text_tokens: 31729.0 tgs: 60 data_time: 0.97s time: 521.45s eta: 3 days, 5:03:31
|
| 380 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.342 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.1GB text_tokens: 32208.0 tgs: 61 data_time: 0.80s time: 523.94s eta: 3 days, 5:16:54
|
| 382 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.295 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.0GB text_tokens: 31951.0 tgs: 61 data_time: 0.94s time: 518.85s eta: 3 days, 4:23:11
|
| 384 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 32.9GB text_tokens: 31537.0 tgs: 60 data_time: 0.58s time: 519.24s eta: 3 days, 4:17:57
|
| 386 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.0GB text_tokens: 31597.0 tgs: 60 data_time: 0.78s time: 520.71s eta: 3 days, 4:22:17
|
| 388 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 32.8GB text_tokens: 31472.0 tgs: 60 data_time: 0.80s time: 524.25s eta: 3 days, 4:44:41
|
| 390 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 31847.0 tgs: 61 data_time: 0.91s time: 520.33s eta: 3 days, 4:01:33
|
| 392 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 32423.0 tgs: 62 data_time: 0.93s time: 519.50s eta: 3 days, 3:45:35
|
| 394 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 15][DP 3][SP 3][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.0GB text_tokens: 32055.0 tgs: 61 data_time: 0.97s time: 522.32s eta: 3 days, 4:01:34
|
20250120235238/rank17.log
ADDED
|
@@ -0,0 +1,395 @@
| 1 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
|
| 2 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
|
| 6 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
|
| 7 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
|
| 8 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
|
| 9 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
|
| 10 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
|
| 11 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.17s
|
| 12 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 77 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 78 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 79 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 80 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 81 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 82 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 83 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 84 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 85 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 86 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.27 seconds, peak gpu memory 13.4G
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 31743.0 tgs: 58 data_time: 1.81s time: 547.02s eta: 3 days, 18:06:20
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.0GB text_tokens: 32110.0 tgs: 61 data_time: 0.92s time: 523.24s eta: 3 days, 14:02:39
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.6GB text_tokens: 31536.0 tgs: 60 data_time: 0.84s time: 522.89s eta: 3 days, 13:50:27
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.323 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.0GB text_tokens: 31549.0 tgs: 60 data_time: 1.03s time: 520.29s eta: 3 days, 13:16:11
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.0GB text_tokens: 31125.0 tgs: 59 data_time: 0.85s time: 520.99s eta: 3 days, 13:14:20
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.7GB text_tokens: 31488.0 tgs: 60 data_time: 1.13s time: 520.87s eta: 3 days, 13:04:34
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 30800.0 tgs: 58 data_time: 0.76s time: 523.38s eta: 3 days, 13:20:26
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.308 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 32301.0 tgs: 62 data_time: 0.94s time: 520.66s eta: 3 days, 12:45:08
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.218 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 31885.0 tgs: 61 data_time: 0.92s time: 520.18s eta: 3 days, 12:31:45
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.337 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.0GB text_tokens: 32246.0 tgs: 61 data_time: 0.83s time: 520.43s eta: 3 days, 12:25:31
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.0GB text_tokens: 32255.0 tgs: 61 data_time: 0.79s time: 524.53s eta: 3 days, 12:56:43
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.306 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.1GB text_tokens: 31976.0 tgs: 61 data_time: 0.86s time: 520.65s eta: 3 days, 12:10:18
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.220 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.0GB text_tokens: 32057.0 tgs: 61 data_time: 0.72s time: 519.92s eta: 3 days, 11:54:35
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.0GB text_tokens: 31783.0 tgs: 60 data_time: 0.79s time: 521.29s eta: 3 days, 11:59:09
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 32.6GB text_tokens: 30276.0 tgs: 57 data_time: 0.68s time: 524.12s eta: 3 days, 12:17:48
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.0GB text_tokens: 32240.0 tgs: 61 data_time: 0.74s time: 520.54s eta: 3 days, 11:34:31
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 32111.0 tgs: 61 data_time: 0.89s time: 518.81s eta: 3 days, 11:09:11
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.220 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.1GB text_tokens: 30872.0 tgs: 59 data_time: 0.75s time: 522.11s eta: 3 days, 11:32:13
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.9GB text_tokens: 31286.0 tgs: 59 data_time: 0.96s time: 523.83s eta: 3 days, 11:40:02
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.218 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 32.9GB text_tokens: 31340.0 tgs: 60 data_time: 0.68s time: 520.47s eta: 3 days, 10:59:11
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 31187.0 tgs: 60 data_time: 0.86s time: 518.43s eta: 3 days, 10:31:02
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.295 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 32.7GB text_tokens: 31884.0 tgs: 60 data_time: 0.89s time: 522.84s eta: 3 days, 11:04:24
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 32216.0 tgs: 61 data_time: 0.86s time: 523.51s eta: 3 days, 11:02:04
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.0GB text_tokens: 31832.0 tgs: 61 data_time: 0.64s time: 520.96s eta: 3 days, 10:29:07
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.1GB text_tokens: 31974.0 tgs: 61 data_time: 0.93s time: 519.10s eta: 3 days, 10:02:49
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.0GB text_tokens: 31429.0 tgs: 60 data_time: 0.74s time: 523.29s eta: 3 days, 10:33:50
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 31130.0 tgs: 59 data_time: 0.93s time: 522.95s eta: 3 days, 10:21:50
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 32.6GB text_tokens: 30936.0 tgs: 59 data_time: 0.57s time: 521.32s eta: 3 days, 9:57:45
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.0GB text_tokens: 31633.0 tgs: 60 data_time: 0.68s time: 520.64s eta: 3 days, 9:42:39
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.1GB text_tokens: 31970.0 tgs: 61 data_time: 0.97s time: 522.40s eta: 3 days, 9:50:32
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.210 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.8GB text_tokens: 25656.0 tgs: 48 data_time: 0.79s time: 523.75s eta: 3 days, 9:54:32
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.0GB text_tokens: 31702.0 tgs: 60 data_time: 0.87s time: 520.43s eta: 3 days, 9:14:39
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.8GB text_tokens: 31282.0 tgs: 60 data_time: 0.65s time: 520.68s eta: 3 days, 9:08:20
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.1GB text_tokens: 31475.0 tgs: 60 data_time: 0.54s time: 521.29s eta: 3 days, 9:05:20
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.1GB text_tokens: 31004.0 tgs: 59 data_time: 0.90s time: 524.11s eta: 3 days, 9:22:57
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 32.9GB text_tokens: 31063.0 tgs: 59 data_time: 0.88s time: 520.96s eta: 3 days, 8:44:56
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 32.8GB text_tokens: 31013.0 tgs: 59 data_time: 0.89s time: 520.16s eta: 3 days, 8:28:50
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.355 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.1GB text_tokens: 32430.0 tgs: 62 data_time: 0.75s time: 522.56s eta: 3 days, 8:42:25
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 32259.0 tgs: 61 data_time: 0.56s time: 524.33s eta: 3 days, 8:50:04
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.9GB text_tokens: 31282.0 tgs: 60 data_time: 0.81s time: 519.89s eta: 3 days, 8:00:16
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.0GB text_tokens: 31847.0 tgs: 61 data_time: 0.72s time: 520.10s eta: 3 days, 7:53:33
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.1GB text_tokens: 32064.0 tgs: 61 data_time: 0.55s time: 522.00s eta: 3 days, 8:02:22
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.0GB text_tokens: 30877.0 tgs: 58 data_time: 0.57s time: 524.35s eta: 3 days, 8:15:18
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.0GB text_tokens: 31532.0 tgs: 60 data_time: 0.66s time: 520.51s eta: 3 days, 7:31:20
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.8GB text_tokens: 31384.0 tgs: 60 data_time: 1.06s time: 518.57s eta: 3 days, 7:04:52
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 31932.0 tgs: 61 data_time: 0.90s time: 523.15s eta: 3 days, 7:38:08
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.211 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.0GB text_tokens: 31103.0 tgs: 59 data_time: 0.67s time: 523.62s eta: 3 days, 7:33:42
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 33.0GB text_tokens: 32490.0 tgs: 62 data_time: 0.93s time: 520.16s eta: 3 days, 6:53:27
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.0GB text_tokens: 32230.0 tgs: 61 data_time: 0.92s time: 520.05s eta: 3 days, 6:43:49
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.0GB text_tokens: 31721.0 tgs: 60 data_time: 0.93s time: 523.76s eta: 3 days, 7:08:45
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 32.9GB text_tokens: 31830.0 tgs: 60 data_time: 0.81s time: 523.77s eta: 3 days, 7:00:04
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.307 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 32.9GB text_tokens: 30994.0 tgs: 59 data_time: 0.73s time: 520.24s eta: 3 days, 6:19:30
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.314 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 31322.0 tgs: 60 data_time: 0.86s time: 518.78s eta: 3 days, 5:57:38
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 32.9GB text_tokens: 32082.0 tgs: 61 data_time: 0.77s time: 522.69s eta: 3 days, 6:24:11
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 31833.0 tgs: 60 data_time: 0.77s time: 522.74s eta: 3 days, 6:15:57
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.0GB text_tokens: 31319.0 tgs: 60 data_time: 0.85s time: 520.53s eta: 3 days, 5:47:24
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 32.8GB text_tokens: 29936.0 tgs: 57 data_time: 0.94s time: 519.66s eta: 3 days, 5:30:55
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.0GB text_tokens: 32364.0 tgs: 62 data_time: 0.87s time: 521.28s eta: 3 days, 5:36:47
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.8GB text_tokens: 31346.0 tgs: 59 data_time: 0.68s time: 523.59s eta: 3 days, 5:48:41
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 31934.0 tgs: 61 data_time: 0.78s time: 519.32s eta: 3 days, 5:01:58
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.0GB text_tokens: 32223.0 tgs: 61 data_time: 0.82s time: 520.65s eta: 3 days, 5:05:07
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 32.9GB text_tokens: 31650.0 tgs: 60 data_time: 0.80s time: 521.45s eta: 3 days, 5:03:31
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 32139.0 tgs: 61 data_time: 0.75s time: 523.95s eta: 3 days, 5:16:55
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 30916.0 tgs: 59 data_time: 0.63s time: 518.85s eta: 3 days, 4:23:12
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.314 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.0GB text_tokens: 31094.0 tgs: 59 data_time: 0.95s time: 519.25s eta: 3 days, 4:18:02
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 32.9GB text_tokens: 31963.0 tgs: 61 data_time: 0.83s time: 520.71s eta: 3 days, 4:22:17
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.0GB text_tokens: 31710.0 tgs: 60 data_time: 0.88s time: 524.26s eta: 3 days, 4:44:42
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 32386.0 tgs: 62 data_time: 0.93s time: 520.29s eta: 3 days, 4:01:13
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 32507.0 tgs: 62 data_time: 0.75s time: 519.50s eta: 3 days, 3:45:35
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 17][DP 4][SP 1][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.321 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 31766.0 tgs: 60 data_time: 0.59s time: 522.32s eta: 3 days, 4:01:35
20250120235238/rank18.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.12s
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 77 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 78 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 79 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 80 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 81 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 82 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 83 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 84 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 85 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 86 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 189 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 190 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 191 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 192 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 193 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 194 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 195 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 196 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 197 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 198 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 199 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 200 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 201 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 202 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 203 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 204 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 205 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 206 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 207 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 208 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 209 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 210 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 211 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 212 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 213 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 214 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 215 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 216 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 217 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 218 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 219 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 220 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 221 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 222 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 245 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 246 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 254 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.71 seconds, peak gpu memory 13.4G
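The `Dispatch ... forward to ...` lines above all record the same operation: while parallelizing the model, XTuner walks the module tree and rebinds each matching submodule's `forward` to an optimized implementation (`qwen2_attn_flash_forward` for the attention blocks, `rms_norm_forward` for the RMSNorm layers). A minimal sketch of that rebinding pattern follows; the function bodies and names are illustrative stand-ins for this note, not XTuner's actual internals:

```python
import types
import torch

def rms_norm_forward(self, hidden_states):
    # Stand-in for a fused RMSNorm kernel: same math as Qwen2RMSNorm,
    # computed in fp32 and cast back to the weight dtype.
    variance = hidden_states.float().pow(2).mean(-1, keepdim=True)
    normed = hidden_states.float() * torch.rsqrt(variance + self.variance_epsilon)
    return (self.weight * normed).to(self.weight.dtype)

def dispatch_forwards(model, log=print):
    # Walk every named submodule and rebind `forward` on matching types,
    # logging one line per rebind -- the pattern seen in this log.
    for name, module in model.named_modules():
        if type(module).__name__ == "Qwen2RMSNorm":
            module.forward = types.MethodType(rms_norm_forward, module)
            log(f"Dispatch {name}({type(module).__name__}) forward to `rms_norm_forward`")
```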
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
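Every line in this file carries the prefix `[RANK 18][DP 4][SP 2][TP 0]`, which is consistent with the global rank being factored into data-, sequence-, and tensor-parallel coordinates. A sketch of that decomposition, under the assumption (inferred from the numbers here, not read from the run's config) that tp varies fastest with `tp_size=1` and `sp_size=4`:

```python
def decompose_rank(rank: int, sp_size: int = 4, tp_size: int = 1):
    # Assumed layout: tp varies fastest, then sp, then dp.
    tp = rank % tp_size
    sp = (rank // tp_size) % sp_size
    dp = rank // (tp_size * sp_size)
    return dp, sp, tp

assert decompose_rank(18) == (4, 2, 0)  # matches "[RANK 18][DP 4][SP 2][TP 0]"
```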
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 31743.0 tgs: 58 data_time: 1.77s time: 547.16s eta: 3 days, 18:07:47
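From the very first step, every iteration in this run trips the same guard: the reduced loss and global grad norm come back NaN, the optimizer update is skipped, and `if_nan_skip` counts the skips. A minimal sketch of such a guard around a standard PyTorch clip-then-step sequence (illustrative names, not XTuner's internals):

```python
import math
import torch

def clip_and_step(model, optimizer, max_norm, step, skipped, log=print):
    # clip_grad_norm_ returns the total (pre-clip) grad norm as a tensor;
    # with NaN/Inf gradients it returns a non-finite value instead of raising.
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    if not math.isfinite(grad_norm.item()):
        skipped += 1
        log(f"[Step {step}] The grad norm is NaN or Inf, skip this step. "
            f"Skipped {skipped} steps in total.")
    else:
        optimizer.step()       # only update weights on finite gradients
    optimizer.zero_grad()      # either way, drop this step's gradients
    return skipped
```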
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.335 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.0GB text_tokens: 32110.0 tgs: 61 data_time: 0.88s time: 523.25s eta: 3 days, 14:02:43
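The per-step metrics are internally consistent: `tgs` (tokens per GPU per second) tracks `text_tokens / time`, and `eta` extrapolates the step time over the remaining steps. A quick sanity check against the Step 2 line above (assumed relations; the exact smoothing XTuner applies may differ):

```python
# Numbers taken from the Step 2/593 log line above.
text_tokens = 32110.0            # tokens processed this step on this rank
step_time   = 523.25             # seconds for this step
tgs = text_tokens / step_time    # tokens per GPU per second
print(round(tgs))                # -> 61, matching "tgs: 61"

remaining = 593 - 2              # steps left after step 2
eta_days = remaining * step_time / 86400
print(eta_days)                  # -> ~3.58 days, matching "eta: 3 days, 14:02:43"
```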
|
| 260 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
|
| 261 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.6GB text_tokens: 31536.0 tgs: 60 data_time: 0.82s time: 522.89s eta: 3 days, 13:50:27
|
| 262 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
|
| 263 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.0GB text_tokens: 31549.0 tgs: 60 data_time: 1.03s time: 520.29s eta: 3 days, 13:16:12
|
| 264 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
|
| 265 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.0GB text_tokens: 31125.0 tgs: 59 data_time: 0.84s time: 520.99s eta: 3 days, 13:14:20
|
| 266 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
|
| 267 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.314 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.7GB text_tokens: 31488.0 tgs: 60 data_time: 1.09s time: 520.87s eta: 3 days, 13:04:34
|
| 268 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
|
| 269 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 30800.0 tgs: 58 data_time: 0.74s time: 523.39s eta: 3 days, 13:20:27
|
| 270 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
|
| 271 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 32301.0 tgs: 62 data_time: 0.91s time: 520.66s eta: 3 days, 12:45:09
|
| 272 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
|
| 273 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 31885.0 tgs: 61 data_time: 0.90s time: 520.18s eta: 3 days, 12:31:44
|
| 274 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
|
| 275 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.0GB text_tokens: 32246.0 tgs: 61 data_time: 0.78s time: 520.43s eta: 3 days, 12:25:32
|
| 276 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
|
| 277 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.0GB text_tokens: 32255.0 tgs: 61 data_time: 0.76s time: 524.53s eta: 3 days, 12:56:43
|
| 278 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
|
| 279 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.1GB text_tokens: 31976.0 tgs: 61 data_time: 0.83s time: 520.65s eta: 3 days, 12:10:18
|
| 280 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
|
| 281 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.0GB text_tokens: 32057.0 tgs: 61 data_time: 0.71s time: 519.92s eta: 3 days, 11:54:36
|
| 282 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
|
| 283 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.0GB text_tokens: 31783.0 tgs: 60 data_time: 0.76s time: 521.29s eta: 3 days, 11:59:07
|
| 284 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
|
| 285 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 32.6GB text_tokens: 30276.0 tgs: 57 data_time: 0.67s time: 524.13s eta: 3 days, 12:17:48
|
| 286 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
|
| 287 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.0GB text_tokens: 32240.0 tgs: 61 data_time: 0.72s time: 520.54s eta: 3 days, 11:34:33
|
| 288 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
|
| 289 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 32111.0 tgs: 61 data_time: 0.86s time: 518.81s eta: 3 days, 11:09:12
|
| 290 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
|
| 291 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.223 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.1GB text_tokens: 30872.0 tgs: 59 data_time: 0.73s time: 522.11s eta: 3 days, 11:32:13
|
| 292 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
|
| 293 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.9GB text_tokens: 31286.0 tgs: 59 data_time: 0.94s time: 523.83s eta: 3 days, 11:40:03
|
| 294 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
|
| 295 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 32.9GB text_tokens: 31340.0 tgs: 60 data_time: 0.67s time: 520.47s eta: 3 days, 10:59:11
|
| 296 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
|
| 297 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 31187.0 tgs: 60 data_time: 0.83s time: 518.43s eta: 3 days, 10:31:02
|
| 298 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
|
| 299 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 32.7GB text_tokens: 31884.0 tgs: 60 data_time: 0.87s time: 522.84s eta: 3 days, 11:04:25
|
| 300 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
|
| 301 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 32216.0 tgs: 61 data_time: 0.82s time: 523.51s eta: 3 days, 11:02:03
|
| 302 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
|
| 303 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.0GB text_tokens: 31832.0 tgs: 61 data_time: 0.61s time: 520.97s eta: 3 days, 10:29:10
|
| 304 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
|
| 305 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.1GB text_tokens: 31974.0 tgs: 61 data_time: 0.91s time: 519.09s eta: 3 days, 10:02:44
|
| 306 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
|
| 307 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.0GB text_tokens: 31429.0 tgs: 60 data_time: 0.72s time: 523.29s eta: 3 days, 10:33:50
|
| 308 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
|
| 309 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 31130.0 tgs: 59 data_time: 0.92s time: 522.95s eta: 3 days, 10:21:51
|
| 310 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
|
| 311 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 32.6GB text_tokens: 30936.0 tgs: 59 data_time: 0.56s time: 521.32s eta: 3 days, 9:57:46
|
| 312 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
|
| 313 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.0GB text_tokens: 31633.0 tgs: 60 data_time: 0.66s time: 520.64s eta: 3 days, 9:42:39
|
| 314 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
|
| 315 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.1GB text_tokens: 31970.0 tgs: 61 data_time: 0.94s time: 522.40s eta: 3 days, 9:50:32
|
| 316 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
|
| 317 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.8GB text_tokens: 25656.0 tgs: 48 data_time: 0.77s time: 523.75s eta: 3 days, 9:54:33
|
| 318 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
|
| 319 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.0GB text_tokens: 31702.0 tgs: 60 data_time: 0.84s time: 520.43s eta: 3 days, 9:14:40
|
| 320 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
|
| 321 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.8GB text_tokens: 31282.0 tgs: 60 data_time: 0.63s time: 520.68s eta: 3 days, 9:08:20
|
| 322 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
|
| 323 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.1GB text_tokens: 31475.0 tgs: 60 data_time: 0.52s time: 521.29s eta: 3 days, 9:05:21
|
| 324 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
|
| 325 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.1GB text_tokens: 31004.0 tgs: 59 data_time: 0.85s time: 524.11s eta: 3 days, 9:22:58
|
| 326 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
|
| 327 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 32.9GB text_tokens: 31063.0 tgs: 59 data_time: 0.86s time: 520.96s eta: 3 days, 8:44:55
|
| 328 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
|
| 329 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 32.8GB text_tokens: 31013.0 tgs: 59 data_time: 0.88s time: 520.17s eta: 3 days, 8:28:52
|
| 330 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
|
| 331 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.1GB text_tokens: 32430.0 tgs: 62 data_time: 0.73s time: 522.56s eta: 3 days, 8:42:25
|
| 332 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
|
| 333 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 32259.0 tgs: 61 data_time: 0.55s time: 524.33s eta: 3 days, 8:50:04
|
| 334 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
|
| 335 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.9GB text_tokens: 31282.0 tgs: 60 data_time: 0.79s time: 519.89s eta: 3 days, 8:00:17
|
| 336 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
|
| 337 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.0GB text_tokens: 31847.0 tgs: 61 data_time: 0.70s time: 520.10s eta: 3 days, 7:53:34
|
| 338 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
|
| 339 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.1GB text_tokens: 32064.0 tgs: 61 data_time: 0.54s time: 522.00s eta: 3 days, 8:02:24
|
| 340 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.0GB text_tokens: 30877.0 tgs: 58 data_time: 0.54s time: 524.35s eta: 3 days, 8:15:18
|
| 342 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.0GB text_tokens: 31532.0 tgs: 60 data_time: 0.65s time: 520.51s eta: 3 days, 7:31:20
|
| 344 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.298 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.8GB text_tokens: 31384.0 tgs: 60 data_time: 1.04s time: 518.57s eta: 3 days, 7:04:53
|
| 346 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 31932.0 tgs: 61 data_time: 0.89s time: 523.15s eta: 3 days, 7:38:08
|
| 348 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.0GB text_tokens: 31103.0 tgs: 59 data_time: 0.65s time: 523.63s eta: 3 days, 7:33:42
|
| 350 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 33.0GB text_tokens: 32490.0 tgs: 62 data_time: 0.91s time: 520.16s eta: 3 days, 6:53:28
|
| 352 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.0GB text_tokens: 32230.0 tgs: 61 data_time: 0.91s time: 520.06s eta: 3 days, 6:43:50
|
| 354 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.339 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.0GB text_tokens: 31721.0 tgs: 60 data_time: 0.92s time: 523.76s eta: 3 days, 7:08:45
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 32.9GB text_tokens: 31830.0 tgs: 60 data_time: 0.81s time: 523.77s eta: 3 days, 7:00:05
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.337 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 32.9GB text_tokens: 30994.0 tgs: 59 data_time: 0.72s time: 520.24s eta: 3 days, 6:19:30
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 31322.0 tgs: 60 data_time: 0.89s time: 518.78s eta: 3 days, 5:57:39
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 32.9GB text_tokens: 32082.0 tgs: 61 data_time: 0.75s time: 522.69s eta: 3 days, 6:24:10
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.315 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 31833.0 tgs: 60 data_time: 0.75s time: 522.74s eta: 3 days, 6:15:58
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.0GB text_tokens: 31319.0 tgs: 60 data_time: 0.84s time: 520.53s eta: 3 days, 5:47:24
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 32.8GB text_tokens: 29936.0 tgs: 57 data_time: 0.92s time: 519.66s eta: 3 days, 5:30:56
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.0GB text_tokens: 32364.0 tgs: 62 data_time: 0.86s time: 521.28s eta: 3 days, 5:36:48
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.314 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.8GB text_tokens: 31346.0 tgs: 59 data_time: 0.66s time: 523.59s eta: 3 days, 5:48:41
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.322 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 31934.0 tgs: 61 data_time: 0.77s time: 519.32s eta: 3 days, 5:01:59
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.0GB text_tokens: 32223.0 tgs: 61 data_time: 0.81s time: 520.65s eta: 3 days, 5:05:07
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 32.9GB text_tokens: 31650.0 tgs: 60 data_time: 0.79s time: 521.45s eta: 3 days, 5:03:32
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.206 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 32139.0 tgs: 61 data_time: 0.75s time: 523.95s eta: 3 days, 5:16:56
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.310 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 30916.0 tgs: 59 data_time: 0.62s time: 518.85s eta: 3 days, 4:23:12
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.0GB text_tokens: 31094.0 tgs: 59 data_time: 0.91s time: 519.25s eta: 3 days, 4:18:03
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 32.9GB text_tokens: 31963.0 tgs: 61 data_time: 0.83s time: 520.72s eta: 3 days, 4:22:18
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.0GB text_tokens: 31710.0 tgs: 60 data_time: 0.86s time: 524.25s eta: 3 days, 4:44:42
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.211 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 32386.0 tgs: 62 data_time: 0.92s time: 520.29s eta: 3 days, 4:01:14
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 32507.0 tgs: 62 data_time: 0.74s time: 519.50s eta: 3 days, 3:45:35
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 18][DP 4][SP 2][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 31766.0 tgs: 60 data_time: 0.59s time: 522.32s eta: 3 days, 4:01:35
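Every step in this rank18 excerpt trips the same guard: the local loss is finite (~0.2-0.3), but loss(reduced) and grad_norm are nan, so the parameter update is skipped and if_nan_skip climbs in lockstep with the step counter. The sketch below shows the general shape of such a guard in plain PyTorch; it is an illustration under stated assumptions (max_grad_norm=1 comes from the run config later in this diff), not XTuner's actual implementation.

```python
import torch

def guarded_step(model, optimizer, max_grad_norm=1.0, skipped=0):
    """Clip gradients, but skip the update when the norm is NaN/Inf.

    Mirrors the logged behaviour: the step still counts, parameters
    stay untouched, and a running skip counter is reported.
    """
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if torch.isfinite(grad_norm):
        optimizer.step()
    else:
        skipped += 1
        print(f"The grad norm is NaN or Inf, skip this step. "
              f"Skipped {skipped} steps in total.")
    optimizer.zero_grad()
    return grad_norm, skipped
```

That the per-rank loss is finite while the reduced loss is nan suggests the non-finite values originate on some other rank (or during the cross-rank reduction), which would explain why every rank skips in unison.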
20250120235238/rank21.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
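The config above fixes the schedule constants seen later in this log: with 593 optimizer steps in the single epoch and warmup_ratio=0.025, a linear warmup of int(0.025 * 593) = 14 steps reproduces the printed lr values exactly, up to the 2e-05 peak that holds from roughly step 15 onward. A small check of that arithmetic (assuming linear warmup; variable names are illustrative):

```python
total_steps = 593            # from "Step N/593" in the log
lr_max = 2e-05               # lr in the config above
warmup_steps = int(total_steps * 0.025)   # warmup_ratio -> 14 steps

def warmup_lr(step):
    """Linearly ramped lr; `step` is 1-based as printed in the log."""
    return lr_max * min(step, warmup_steps) / warmup_steps

# Logged: step 1 -> 0.000001, step 6 -> 0.000009, step 12 -> 0.000017
print([f"{warmup_lr(s):.6f}" for s in (1, 6, 12)])
```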
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
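Each shard drops the handful of samples whose tokenized prompt exceeds max_length=32768 before the dataset is cached. A minimal sketch of that filter (hypothetical helper names; the real pipeline also soft-packs and caches the result):

```python
def drop_overlong(samples, tokenize, max_length=32768, fname=""):
    """Keep only samples whose tokenized length fits max_length,
    reporting the discard count per file the way the log does."""
    kept = [s for s in samples if len(tokenize(s)) <= max_length]
    dropped = len(samples) - len(kept)
    if dropped:
        print(f"{fname} has {dropped} prompt length>{max_length}, discard.")
    return kept
```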
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.15s
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[... the same three dispatch lines repeat for model.layers.1 through model.layers.79 (timestamps 00:07:55-00:07:56) ...]
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.70 seconds, peak gpu memory 13.4G
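The DEBUG block above records module-by-module forward dispatch: every Qwen2FlashAttention2 is swapped to `qwen2_attn_flash_forward` and every Qwen2RMSNorm to `rms_norm_forward` before the model is sharded. The underlying pattern is rebinding `forward` on matching submodules, sketched below (illustrative only; the names mirror the log, not XTuner's real code, and `model` is assumed to be a torch `nn.Module`):

```python
import types
import torch.nn as nn

def dispatch_forward(model: nn.Module, cls_name: str, new_forward, log=print):
    """Rebind `forward` on every submodule whose class name matches,
    logging each dispatch in the style of the DEBUG lines above."""
    for name, module in model.named_modules():
        if type(module).__name__ == cls_name:
            module.forward = types.MethodType(new_forward, module)
            log(f"Dispatch {name}({cls_name}) forward to `{new_forward.__name__}`")
```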
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.211 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 32.3GB text_tokens: 29620.0 tgs: 54 data_time: 1.87s time: 547.04s eta: 3 days, 18:06:36
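The throughput and ETA columns are derived from the others: tgs is text_tokens divided by the per-step wall time, and eta is that step time multiplied by the steps still to run. Plugging in the Step 1/593 numbers (a quick check; it matches the printed values up to rounding):

```python
import datetime

text_tokens, step_time = 29620.0, 547.04   # from the Step 1/593 line above
step, total_steps = 1, 593

tgs = int(text_tokens / step_time)            # -> 54, as logged
remaining = total_steps - step + 1            # 593 steps still to run
eta = datetime.timedelta(seconds=round(step_time * remaining))
print(tgs, eta)   # 54  3 days, 18:06:35 (log prints 3 days, 18:06:36)
```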
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 32369.0 tgs: 61 data_time: 0.80s time: 523.24s eta: 3 days, 14:02:39
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 32546.0 tgs: 62 data_time: 0.79s time: 522.89s eta: 3 days, 13:50:26
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.0GB text_tokens: 32411.0 tgs: 62 data_time: 1.04s time: 520.29s eta: 3 days, 13:16:12
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.0GB text_tokens: 30927.0 tgs: 59 data_time: 0.90s time: 520.99s eta: 3 days, 13:14:20
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.0GB text_tokens: 32427.0 tgs: 62 data_time: 1.18s time: 520.88s eta: 3 days, 13:04:34
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.0GB text_tokens: 31527.0 tgs: 60 data_time: 1.10s time: 523.38s eta: 3 days, 13:20:26
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.331 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 31799.0 tgs: 61 data_time: 0.68s time: 520.66s eta: 3 days, 12:45:08
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.0GB text_tokens: 30776.0 tgs: 59 data_time: 0.71s time: 520.18s eta: 3 days, 12:31:45
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.0GB text_tokens: 31699.0 tgs: 60 data_time: 0.72s time: 520.43s eta: 3 days, 12:25:31
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.232 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 32.8GB text_tokens: 31472.0 tgs: 59 data_time: 0.98s time: 524.53s eta: 3 days, 12:56:42
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.0GB text_tokens: 31345.0 tgs: 60 data_time: 0.67s time: 520.65s eta: 3 days, 12:10:18
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.332 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.1GB text_tokens: 30795.0 tgs: 59 data_time: 0.76s time: 519.92s eta: 3 days, 11:54:35
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.323 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.9GB text_tokens: 31328.0 tgs: 60 data_time: 0.73s time: 521.29s eta: 3 days, 11:59:06
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 32265.0 tgs: 61 data_time: 0.83s time: 524.13s eta: 3 days, 12:17:51
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 32.8GB text_tokens: 29937.0 tgs: 57 data_time: 0.77s time: 520.54s eta: 3 days, 11:34:29
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 30786.0 tgs: 59 data_time: 0.74s time: 518.81s eta: 3 days, 11:09:12
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31670.0 tgs: 60 data_time: 0.82s time: 522.11s eta: 3 days, 11:32:13
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.9GB text_tokens: 31374.0 tgs: 59 data_time: 0.61s time: 523.83s eta: 3 days, 11:40:02
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.311 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.1GB text_tokens: 32080.0 tgs: 61 data_time: 0.74s time: 520.47s eta: 3 days, 10:59:11
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 32.9GB text_tokens: 30100.0 tgs: 58 data_time: 0.76s time: 518.43s eta: 3 days, 10:31:02
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 32.8GB text_tokens: 31525.0 tgs: 60 data_time: 0.87s time: 522.84s eta: 3 days, 11:04:24
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.383 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 32200.0 tgs: 61 data_time: 0.89s time: 523.51s eta: 3 days, 11:02:03
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.0GB text_tokens: 31533.0 tgs: 60 data_time: 1.02s time: 520.96s eta: 3 days, 10:29:07
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.1GB text_tokens: 31572.0 tgs: 60 data_time: 0.76s time: 519.10s eta: 3 days, 10:02:48
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.0GB text_tokens: 31711.0 tgs: 60 data_time: 0.56s time: 523.29s eta: 3 days, 10:33:50
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 32.7GB text_tokens: 31027.0 tgs: 59 data_time: 0.78s time: 522.95s eta: 3 days, 10:21:50
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 32.8GB text_tokens: 31174.0 tgs: 59 data_time: 1.02s time: 521.32s eta: 3 days, 9:57:45
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.0GB text_tokens: 31786.0 tgs: 61 data_time: 0.89s time: 520.64s eta: 3 days, 9:42:40
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 32.9GB text_tokens: 31831.0 tgs: 60 data_time: 0.91s time: 522.40s eta: 3 days, 9:50:31
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.312 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.9GB text_tokens: 31822.0 tgs: 60 data_time: 0.86s time: 523.75s eta: 3 days, 9:54:32
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 32.9GB text_tokens: 31024.0 tgs: 59 data_time: 0.75s time: 520.43s eta: 3 days, 9:14:39
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.1GB text_tokens: 31792.0 tgs: 61 data_time: 1.00s time: 520.68s eta: 3 days, 9:08:20
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.209 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 32.7GB text_tokens: 31319.0 tgs: 60 data_time: 0.62s time: 521.29s eta: 3 days, 9:05:20
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.1GB text_tokens: 32256.0 tgs: 61 data_time: 0.63s time: 524.11s eta: 3 days, 9:22:57
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 32359.0 tgs: 62 data_time: 0.87s time: 520.96s eta: 3 days, 8:44:54
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 32.7GB text_tokens: 31433.0 tgs: 60 data_time: 0.89s time: 520.16s eta: 3 days, 8:28:51
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.0GB text_tokens: 30457.0 tgs: 58 data_time: 0.86s time: 522.56s eta: 3 days, 8:42:25
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.205 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.9GB text_tokens: 32061.0 tgs: 61 data_time: 0.85s time: 524.33s eta: 3 days, 8:50:04
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.9GB text_tokens: 30958.0 tgs: 59 data_time: 0.68s time: 519.89s eta: 3 days, 8:00:16
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.232 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 32067.0 tgs: 61 data_time: 0.85s time: 520.10s eta: 3 days, 7:53:34
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.224 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.0GB text_tokens: 31896.0 tgs: 61 data_time: 0.93s time: 522.00s eta: 3 days, 8:02:23
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.225 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 32.8GB text_tokens: 30763.0 tgs: 58 data_time: 0.74s time: 524.35s eta: 3 days, 8:15:18
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 32.9GB text_tokens: 31487.0 tgs: 60 data_time: 0.88s time: 520.51s eta: 3 days, 7:31:19
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.1GB text_tokens: 31702.0 tgs: 61 data_time: 0.71s time: 518.56s eta: 3 days, 7:04:52
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 32.9GB text_tokens: 31920.0 tgs: 61 data_time: 0.91s time: 523.16s eta: 3 days, 7:38:11
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.0GB text_tokens: 31789.0 tgs: 60 data_time: 0.75s time: 523.62s eta: 3 days, 7:33:40
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.219 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.9GB text_tokens: 31065.0 tgs: 59 data_time: 0.74s time: 520.16s eta: 3 days, 6:53:27
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 32174.0 tgs: 61 data_time: 0.77s time: 520.06s eta: 3 days, 6:43:50
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.0GB text_tokens: 31306.0 tgs: 59 data_time: 1.02s time: 523.76s eta: 3 days, 7:08:44
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.0GB text_tokens: 29824.0 tgs: 56 data_time: 0.68s time: 523.77s eta: 3 days, 7:00:05
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 32.7GB text_tokens: 31180.0 tgs: 59 data_time: 1.08s time: 520.24s eta: 3 days, 6:19:30
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.0GB text_tokens: 31929.0 tgs: 61 data_time: 0.88s time: 518.78s eta: 3 days, 5:57:37
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.322 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.0GB text_tokens: 32302.0 tgs: 61 data_time: 0.55s time: 522.68s eta: 3 days, 6:24:09
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 31344.0 tgs: 59 data_time: 0.68s time: 522.74s eta: 3 days, 6:15:59
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.334 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.8GB text_tokens: 30272.0 tgs: 58 data_time: 0.78s time: 520.52s eta: 3 days, 5:47:22
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 32061.0 tgs: 61 data_time: 0.69s time: 519.66s eta: 3 days, 5:30:56
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 32.9GB text_tokens: 31885.0 tgs: 61 data_time: 0.67s time: 521.28s eta: 3 days, 5:36:47
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 33.1GB text_tokens: 32402.0 tgs: 61 data_time: 1.21s time: 523.59s eta: 3 days, 5:48:41
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.4GB text_tokens: 30488.0 tgs: 58 data_time: 0.85s time: 519.32s eta: 3 days, 5:01:58
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.0GB text_tokens: 31879.0 tgs: 61 data_time: 0.57s time: 520.65s eta: 3 days, 5:05:05
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 31778.0 tgs: 60 data_time: 0.58s time: 521.45s eta: 3 days, 5:03:32
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.333 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.8GB text_tokens: 31838.0 tgs: 60 data_time: 0.94s time: 523.95s eta: 3 days, 5:16:55
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.352 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.0GB text_tokens: 32028.0 tgs: 61 data_time: 1.34s time: 518.85s eta: 3 days, 4:23:11
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.1GB text_tokens: 30998.0 tgs: 59 data_time: 0.67s time: 519.25s eta: 3 days, 4:18:03
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.0GB text_tokens: 31047.0 tgs: 59 data_time: 0.88s time: 520.72s eta: 3 days, 4:22:17
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.0GB text_tokens: 32302.0 tgs: 61 data_time: 0.86s time: 524.25s eta: 3 days, 4:44:41
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 32.6GB text_tokens: 30866.0 tgs: 59 data_time: 0.61s time: 520.29s eta: 3 days, 4:01:13
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 31849.0 tgs: 61 data_time: 0.74s time: 519.50s eta: 3 days, 3:45:35
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 21][DP 5][SP 1][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.363 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 31641.0 tgs: 60 data_time: 0.81s time: 522.32s eta: 3 days, 4:01:34
20250120235238/rank23.log ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.14s
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 189 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 190 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 191 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 192 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 193 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 194 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 195 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 196 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 197 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 198 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 199 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 200 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 201 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 202 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 203 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 204 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 205 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 206 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 207 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 208 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 209 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 210 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 211 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 212 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 213 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 214 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 215 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 216 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 217 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 218 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 219 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 220 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 221 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 222 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 245 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 246 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
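The dispatch lines above reflect XTuner swapping each submodule's stock `forward` for an optimized implementation at setup time. A minimal sketch of that rebinding pattern, assuming plain method monkey-patching (`SimpleRMSNorm` and the replacement body are illustrative stand-ins, not XTuner's actual kernels):

```python
import types

import torch
from torch import nn


class SimpleRMSNorm(nn.Module):
    """Illustrative stand-in for Qwen2RMSNorm: y = x / rms(x) * weight."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        var = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(var + self.variance_epsilon) * self.weight


def rms_norm_forward(self, x):
    # Replacement forward; a real dispatcher would call a fused kernel here.
    var = x.float().pow(2).mean(-1, keepdim=True)
    x = (x.float() * torch.rsqrt(var + self.variance_epsilon)).to(x.dtype)
    return self.weight * x


def dispatch_rms_norm(model: nn.Module) -> None:
    # Rebind `forward` on every *RMSNorm instance; each rebinding is what
    # produces one "Dispatch ... forward to `rms_norm_forward`" log line.
    for name, mod in model.named_modules():
        if mod.__class__.__name__.endswith("RMSNorm"):
            mod.forward = types.MethodType(rms_norm_forward, mod)
            print(f"Dispatch {name}({mod.__class__.__name__}) forward to `rms_norm_forward`")
```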
|
| 254 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.69 seconds, peak gpu memory 13.4G
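The elapsed-time and peak-memory figures in the SUCCESS line can be gathered with standard PyTorch instrumentation; a sketch, assuming a simple wrapper around the parallelization step (the function name and structure are invented for illustration):

```python
import time

import torch


def parallelize_and_report(build_fn):
    """Time a model-parallelization step and report peak GPU memory,
    mirroring the `[Parallelize LLM]` line above (a sketch, not XTuner code)."""
    torch.cuda.reset_peak_memory_stats()
    start = time.perf_counter()
    model = build_fn()
    elapsed = time.perf_counter() - start
    peak_gb = torch.cuda.max_memory_allocated() / 1024 ** 3
    print(f"[Parallelize LLM] Elapsed time {elapsed:.2f} seconds, "
          f"peak gpu memory {peak_gb:.1f}G")
    return model
```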
|
| 255 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
|
| 256 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
|
| 257 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.222 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 32.3GB text_tokens: 29620.0 tgs: 54 data_time: 1.88s time: 547.16s eta: 3 days, 18:07:48
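The WARNING/INFO pairs that follow show the trainer clipping gradients, finding a non-finite norm, and skipping the optimizer update while incrementing the `if_nan_skip` counter. A minimal sketch of that guard, modeled on the log messages rather than XTuner's source:

```python
import torch


def clip_and_maybe_skip(model, optimizer, max_grad_norm=1.0, skipped=0):
    """Clip gradients and skip the optimizer step when the norm is NaN/Inf.

    The counter and message mirror the log above; the exact control flow
    inside XTuner is an assumption.
    """
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if not torch.isfinite(grad_norm):
        skipped += 1
        print(f"The grad norm is NaN or Inf, skip this step. "
              f"Skipped {skipped} steps in total.")
    else:
        optimizer.step()
    optimizer.zero_grad()
    return skipped
```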
|
| 258 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
|
| 259 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.236 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 32369.0 tgs: 61 data_time: 0.79s time: 523.25s eta: 3 days, 14:02:43
|
| 260 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
|
| 261 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 32546.0 tgs: 62 data_time: 0.78s time: 522.89s eta: 3 days, 13:50:26
|
| 262 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
|
| 263 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.338 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.0GB text_tokens: 32411.0 tgs: 62 data_time: 1.03s time: 520.29s eta: 3 days, 13:16:13
|
| 264 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
|
| 265 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.319 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.0GB text_tokens: 30927.0 tgs: 59 data_time: 0.89s time: 520.98s eta: 3 days, 13:14:19
|
| 266 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
|
| 267 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.0GB text_tokens: 32427.0 tgs: 62 data_time: 1.15s time: 520.87s eta: 3 days, 13:04:34
|
| 268 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
|
| 269 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.0GB text_tokens: 31527.0 tgs: 60 data_time: 1.08s time: 523.39s eta: 3 days, 13:20:27
|
| 270 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
|
| 271 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 31799.0 tgs: 61 data_time: 0.65s time: 520.66s eta: 3 days, 12:45:09
|
| 272 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
|
| 273 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.0GB text_tokens: 30776.0 tgs: 59 data_time: 0.70s time: 520.18s eta: 3 days, 12:31:45
|
| 274 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
|
| 275 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.0GB text_tokens: 31699.0 tgs: 60 data_time: 0.70s time: 520.43s eta: 3 days, 12:25:32
|
| 276 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
|
| 277 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 32.8GB text_tokens: 31472.0 tgs: 59 data_time: 0.97s time: 524.53s eta: 3 days, 12:56:43
|
| 278 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
|
| 279 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.219 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.0GB text_tokens: 31345.0 tgs: 60 data_time: 0.66s time: 520.65s eta: 3 days, 12:10:18
|
| 280 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
|
| 281 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.1GB text_tokens: 30795.0 tgs: 59 data_time: 0.73s time: 519.93s eta: 3 days, 11:54:36
|
| 282 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
|
| 283 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.306 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.9GB text_tokens: 31328.0 tgs: 60 data_time: 0.71s time: 521.29s eta: 3 days, 11:59:08
|
| 284 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
|
| 285 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 32265.0 tgs: 61 data_time: 0.81s time: 524.13s eta: 3 days, 12:17:48
|
| 286 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
|
| 287 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 32.8GB text_tokens: 29937.0 tgs: 57 data_time: 0.75s time: 520.54s eta: 3 days, 11:34:32
|
| 288 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
|
| 289 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.360 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 30786.0 tgs: 59 data_time: 0.74s time: 518.81s eta: 3 days, 11:09:12
|
| 290 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
|
| 291 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31670.0 tgs: 60 data_time: 0.81s time: 522.11s eta: 3 days, 11:32:13
|
| 292 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
|
| 293 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.9GB text_tokens: 31374.0 tgs: 59 data_time: 0.56s time: 523.83s eta: 3 days, 11:40:03
|
| 294 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
|
| 295 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.1GB text_tokens: 32080.0 tgs: 61 data_time: 0.72s time: 520.47s eta: 3 days, 10:59:11
|
| 296 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
|
| 297 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 32.9GB text_tokens: 30100.0 tgs: 58 data_time: 0.75s time: 518.43s eta: 3 days, 10:31:03
|
| 298 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
|
| 299 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 32.8GB text_tokens: 31525.0 tgs: 60 data_time: 0.88s time: 522.84s eta: 3 days, 11:04:25
|
| 300 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
|
| 301 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 32200.0 tgs: 61 data_time: 0.87s time: 523.51s eta: 3 days, 11:02:04
|
| 302 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
|
| 303 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.307 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.0GB text_tokens: 31533.0 tgs: 60 data_time: 0.96s time: 520.96s eta: 3 days, 10:29:07
|
| 304 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
|
| 305 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.1GB text_tokens: 31572.0 tgs: 60 data_time: 0.73s time: 519.10s eta: 3 days, 10:02:48
|
| 306 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
|
| 307 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.0GB text_tokens: 31711.0 tgs: 60 data_time: 0.55s time: 523.30s eta: 3 days, 10:33:51
|
| 308 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
|
| 309 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 32.7GB text_tokens: 31027.0 tgs: 59 data_time: 0.77s time: 522.95s eta: 3 days, 10:21:51
|
| 310 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
|
| 311 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.296 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 32.8GB text_tokens: 31174.0 tgs: 59 data_time: 1.00s time: 521.32s eta: 3 days, 9:57:45
|
| 312 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
|
| 313 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.0GB text_tokens: 31786.0 tgs: 61 data_time: 0.87s time: 520.64s eta: 3 days, 9:42:40
|
| 314 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
|
| 315 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.312 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 32.9GB text_tokens: 31831.0 tgs: 60 data_time: 0.89s time: 522.40s eta: 3 days, 9:50:31
|
| 316 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
|
| 317 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.9GB text_tokens: 31822.0 tgs: 60 data_time: 0.85s time: 523.75s eta: 3 days, 9:54:32
|
| 318 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
|
| 319 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 32.9GB text_tokens: 31024.0 tgs: 59 data_time: 0.74s time: 520.43s eta: 3 days, 9:14:40
|
| 320 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
|
| 321 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.1GB text_tokens: 31792.0 tgs: 61 data_time: 0.98s time: 520.68s eta: 3 days, 9:08:20
|
| 322 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
|
| 323 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 32.7GB text_tokens: 31319.0 tgs: 60 data_time: 0.59s time: 521.29s eta: 3 days, 9:05:20
|
| 324 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
|
| 325 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.341 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.1GB text_tokens: 32256.0 tgs: 61 data_time: 0.62s time: 524.11s eta: 3 days, 9:22:58
|
| 326 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
|
| 327 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 32359.0 tgs: 62 data_time: 0.83s time: 520.96s eta: 3 days, 8:44:55
|
| 328 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
|
| 329 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.231 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 32.7GB text_tokens: 31433.0 tgs: 60 data_time: 0.88s time: 520.17s eta: 3 days, 8:28:52
|
| 330 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
|
| 331 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.0GB text_tokens: 30457.0 tgs: 58 data_time: 0.84s time: 522.56s eta: 3 days, 8:42:25
|
| 332 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
|
| 333 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.9GB text_tokens: 32061.0 tgs: 61 data_time: 0.84s time: 524.33s eta: 3 days, 8:50:04
|
| 334 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
|
| 335 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.9GB text_tokens: 30958.0 tgs: 59 data_time: 0.65s time: 519.89s eta: 3 days, 8:00:17
|
| 336 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
|
| 337 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 32067.0 tgs: 61 data_time: 0.83s time: 520.10s eta: 3 days, 7:53:34
|
| 338 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
|
| 339 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.0GB text_tokens: 31896.0 tgs: 61 data_time: 0.92s time: 522.00s eta: 3 days, 8:02:23
|
| 340 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 32.8GB text_tokens: 30763.0 tgs: 58 data_time: 0.72s time: 524.35s eta: 3 days, 8:15:18
|
| 342 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 32.9GB text_tokens: 31487.0 tgs: 60 data_time: 0.87s time: 520.51s eta: 3 days, 7:31:20
|
| 344 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.1GB text_tokens: 31702.0 tgs: 61 data_time: 0.70s time: 518.57s eta: 3 days, 7:04:53
|
| 346 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 32.9GB text_tokens: 31920.0 tgs: 61 data_time: 0.92s time: 523.16s eta: 3 days, 7:38:09
|
| 348 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.0GB text_tokens: 31789.0 tgs: 60 data_time: 0.75s time: 523.63s eta: 3 days, 7:33:42
|
| 350 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.9GB text_tokens: 31065.0 tgs: 59 data_time: 0.77s time: 520.16s eta: 3 days, 6:53:28
|
| 352 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 32174.0 tgs: 61 data_time: 0.75s time: 520.06s eta: 3 days, 6:43:50
|
| 354 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.346 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.0GB text_tokens: 31306.0 tgs: 59 data_time: 1.02s time: 523.76s eta: 3 days, 7:08:45
|
| 356 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
|
| 357 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.0GB text_tokens: 29824.0 tgs: 56 data_time: 0.66s time: 523.77s eta: 3 days, 7:00:06
|
| 358 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
|
| 359 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 32.7GB text_tokens: 31180.0 tgs: 59 data_time: 1.06s time: 520.24s eta: 3 days, 6:19:30
|
| 360 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
|
| 361 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.0GB text_tokens: 31929.0 tgs: 61 data_time: 0.86s time: 518.78s eta: 3 days, 5:57:38
|
| 362 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
|
| 363 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.0GB text_tokens: 32302.0 tgs: 61 data_time: 0.54s time: 522.69s eta: 3 days, 6:24:10
|
| 364 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 31344.0 tgs: 59 data_time: 0.66s time: 522.74s eta: 3 days, 6:15:57
|
| 366 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.8GB text_tokens: 30272.0 tgs: 58 data_time: 0.76s time: 520.53s eta: 3 days, 5:47:24
|
| 368 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 32061.0 tgs: 61 data_time: 0.67s time: 519.66s eta: 3 days, 5:30:57
|
| 370 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 32.9GB text_tokens: 31885.0 tgs: 61 data_time: 0.66s time: 521.28s eta: 3 days, 5:36:47
|
| 372 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 33.1GB text_tokens: 32402.0 tgs: 61 data_time: 1.18s time: 523.59s eta: 3 days, 5:48:42
|
| 374 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.216 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.4GB text_tokens: 30488.0 tgs: 58 data_time: 0.85s time: 519.32s eta: 3 days, 5:01:59
|
| 376 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.0GB text_tokens: 31879.0 tgs: 61 data_time: 0.55s time: 520.65s eta: 3 days, 5:05:08
|
| 378 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 31778.0 tgs: 60 data_time: 0.56s time: 521.45s eta: 3 days, 5:03:32
|
| 380 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.8GB text_tokens: 31838.0 tgs: 60 data_time: 0.92s time: 523.95s eta: 3 days, 5:16:55
|
| 382 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.0GB text_tokens: 32028.0 tgs: 61 data_time: 1.33s time: 518.85s eta: 3 days, 4:23:12
|
| 384 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.1GB text_tokens: 30998.0 tgs: 59 data_time: 0.65s time: 519.25s eta: 3 days, 4:18:03
|
| 386 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.194 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.0GB text_tokens: 31047.0 tgs: 59 data_time: 0.85s time: 520.72s eta: 3 days, 4:22:18
|
| 388 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.0GB text_tokens: 32302.0 tgs: 61 data_time: 0.84s time: 524.25s eta: 3 days, 4:44:42
|
| 390 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.321 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 32.6GB text_tokens: 30866.0 tgs: 59 data_time: 0.61s time: 520.29s eta: 3 days, 4:01:14
|
| 392 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.310 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 31849.0 tgs: 61 data_time: 0.72s time: 519.50s eta: 3 days, 3:45:36
|
| 394 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 23][DP 5][SP 3][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 31641.0 tgs: 60 data_time: 0.80s time: 522.32s eta: 3 days, 4:01:35
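The `tgs` and `eta` fields in these step lines are consistent with simple tokens-per-second and remaining-steps arithmetic; a rough reconstruction (whether the logger uses a smoothed step time is an assumption):

```python
from datetime import timedelta


def tgs_and_eta(text_tokens: float, step_time_s: float, step: int, total_steps: int):
    # tgs: tokens processed per GPU-second for this step (assumed definition).
    tgs = text_tokens / step_time_s
    # eta: remaining steps at the current step time; the trainer likely uses
    # a running average, so this only approximates the logged value.
    eta = timedelta(seconds=round((total_steps - step) * step_time_s))
    return tgs, eta


# Step 70 above: ~31641 tokens in 522.32 s -> tgs ~ 60, eta ~ 3 days.
print(tgs_and_eta(31641.0, 522.32, 70, 593))
```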
|
20250120235238/rank25.log
ADDED
|
@@ -0,0 +1,395 @@
| 1 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
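This line is the standard repr of an `argparse.Namespace`. A minimal reproduction covering a few of the options above; the command-line flag spellings are guesses from the attribute names:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--llm", type=str, required=True)
parser.add_argument("--sp-size", type=int, default=4)       # -> args.sp_size
parser.add_argument("--max-length", type=int, default=32768)
parser.add_argument("--global-batch-size", type=int, default=64)
parser.add_argument("--lr", type=float, default=2e-05)
parser.add_argument("--epochs", type=int, default=1)

args = parser.parse_args(["--llm", "Qwen/Qwen2.5-72B-Instruct"])
print(args)  # Namespace(llm='Qwen/Qwen2.5-72B-Instruct', sp_size=4, ...)
```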
|
| 2 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
|
| 6 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
|
| 7 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
|
| 8 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
|
| 9 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
|
| 10 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
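These warnings show over-length samples being dropped during dataset preprocessing. A sketch of such a filter, assuming each JSONL record exposes a `prompt` field (the schema is an assumption):

```python
def filter_overlong(samples, tokenizer, max_length=32768, path="data_part_8.jsonl"):
    """Drop samples whose tokenized prompt exceeds `max_length`, emitting a
    warning like the ones above (assumed behaviour, not XTuner's exact code)."""
    kept, dropped = [], 0
    for sample in samples:
        if len(tokenizer.encode(sample["prompt"])) > max_length:
            dropped += 1
        else:
            kept.append(sample)
    if dropped:
        print(f"{path} has {dropped} prompt length>{max_length}, discard.")
    return kept
```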
|
| 11 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.15s
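Given `dset_pack_level='soft'` and `global_pack=True` in the arguments above, the remaining samples are then packed into sequences of at most `max_length` tokens before batching; a greedy sketch of such packing (not XTuner's exact algorithm):

```python
def soft_pack(sample_lengths, max_length=32768):
    """Greedy sequence packing: group sample indices into bins whose total
    token count stays at or below `max_length`."""
    bins, current, current_len = [], [], 0
    for idx, n in enumerate(sample_lengths):
        if current and current_len + n > max_length:
            bins.append(current)
            current, current_len = [], 0
        current.append(idx)
        current_len += n
    if current:
        bins.append(current)
    return bins


# Example: pack four samples into 32k-token bins.
print(soft_pack([20000, 11000, 9000, 30000]))  # -> [[0, 1], [2], [3]]
```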
|
| 12 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 141.66 seconds, peak gpu memory 13.4G
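The DEBUG block above records XTuner's dispatch pass: before parallelizing, it walks the module tree and rebinds the forward of every recognized submodule to a faster implementation (a flash-attention forward for each Qwen2FlashAttention2, a fused kernel for each Qwen2RMSNorm and the final model.norm). A rough sketch of that pattern follows, with placeholder bodies standing in for the real kernels (an illustration of the technique, not XTuner's exact code):

import logging
from types import MethodType

log = logging.getLogger("dispatch")

def rms_norm_forward(self, x):
    ...  # placeholder for a fused RMSNorm kernel

def qwen2_attn_flash_forward(self, *args, **kwargs):
    ...  # placeholder for the flash-attention forward

DISPATCH = {
    "Qwen2RMSNorm": rms_norm_forward,
    "Qwen2FlashAttention2": qwen2_attn_flash_forward,
}

def dispatch_forwards(model):
    """Rebind matching submodules' forwards to optimized kernels."""
    for name, module in model.named_modules():
        fn = DISPATCH.get(type(module).__name__)
        if fn is not None:
            module.forward = MethodType(fn, module)  # instance-level override
            log.debug("Dispatch %s(%s) forward to `%s`",
                      name, type(module).__name__, fn.__name__)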
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.229 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 31349.0 tgs: 57 data_time: 1.89s time: 547.07s eta: 3 days, 18:06:50
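From here on, every step trips the same guard: after backward, the total gradient norm is computed, and when it is NaN or Inf the optimizer update is skipped and a running skip counter (if_nan_skip) is incremented; that loss(reduced) is also nan at every step suggests the non-finite gradients are persistent rather than a transient overflow. A minimal sketch of such a guard in a standard PyTorch loop (an assumption about the mechanism, not XTuner's exact code; clip_grad_norm_ returns the pre-clip total norm):

import math
import torch

nan_skips = 0

def guarded_step(model, optimizer, max_norm, step):
    """Apply the optimizer update only when the gradient norm is finite."""
    global nan_skips
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    if not math.isfinite(grad_norm.item()):  # catches NaN and +/-Inf
        nan_skips += 1
        print(f"[Step {step}] The grad norm is NaN or Inf, skip this step. "
              f"Skipped {nan_skips} steps in total.")
    else:
        optimizer.step()
    optimizer.zero_grad()
    return grad_norm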
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 32045.0 tgs: 61 data_time: 0.55s time: 523.25s eta: 3 days, 14:02:46
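The throughput and eta fields are simple derived quantities: tgs is text tokens processed per GPU per second of step time, and eta extrapolates a step-time estimate over the remaining steps. A quick arithmetic check against Step 2's numbers (assuming this is how the fields are derived):

# Step 2/593: text_tokens=32045.0, time=523.25s
tgs = 32045.0 / 523.25    # ~61.2 -> logged as "tgs: 61"
eta = (593 - 2) * 523.25  # ~309,241 s ~ 3 days 13:54
# The logged eta of "3 days, 14:02:46" is a few minutes higher, consistent
# with a smoothed step time rather than the last step's time alone.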
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.0GB text_tokens: 31775.0 tgs: 60 data_time: 0.90s time: 522.85s eta: 3 days, 13:50:02
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.0GB text_tokens: 31442.0 tgs: 60 data_time: 0.83s time: 520.30s eta: 3 days, 13:16:16
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.220 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.0GB text_tokens: 31310.0 tgs: 60 data_time: 0.74s time: 520.99s eta: 3 days, 13:14:24
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.1GB text_tokens: 31970.0 tgs: 61 data_time: 0.84s time: 520.88s eta: 3 days, 13:04:40
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 32046.0 tgs: 61 data_time: 0.83s time: 523.39s eta: 3 days, 13:20:31
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 32280.0 tgs: 61 data_time: 0.66s time: 520.67s eta: 3 days, 12:45:12
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 32093.0 tgs: 61 data_time: 1.03s time: 520.19s eta: 3 days, 12:31:48
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.0GB text_tokens: 32007.0 tgs: 61 data_time: 0.54s time: 520.38s eta: 3 days, 12:24:59
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 32301.0 tgs: 61 data_time: 0.81s time: 524.54s eta: 3 days, 12:56:47
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.221 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.3GB text_tokens: 30685.0 tgs: 58 data_time: 0.81s time: 520.66s eta: 3 days, 12:10:22
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.1GB text_tokens: 30776.0 tgs: 59 data_time: 0.86s time: 519.93s eta: 3 days, 11:54:37
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.318 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.0GB text_tokens: 31850.0 tgs: 61 data_time: 0.72s time: 521.30s eta: 3 days, 11:59:11
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.0GB text_tokens: 32016.0 tgs: 61 data_time: 0.75s time: 524.13s eta: 3 days, 12:17:52
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.0GB text_tokens: 31587.0 tgs: 60 data_time: 1.02s time: 520.55s eta: 3 days, 11:34:36
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.350 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 32.9GB text_tokens: 30793.0 tgs: 59 data_time: 0.76s time: 518.80s eta: 3 days, 11:09:05
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31008.0 tgs: 59 data_time: 0.92s time: 522.11s eta: 3 days, 11:32:17
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.0GB text_tokens: 32462.0 tgs: 61 data_time: 0.88s time: 523.84s eta: 3 days, 11:40:07
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.307 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.0GB text_tokens: 32193.0 tgs: 61 data_time: 0.68s time: 520.44s eta: 3 days, 10:58:51
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 31420.0 tgs: 60 data_time: 0.80s time: 518.44s eta: 3 days, 10:31:06
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.0GB text_tokens: 31579.0 tgs: 60 data_time: 0.94s time: 522.85s eta: 3 days, 11:04:28
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 31823.0 tgs: 60 data_time: 0.82s time: 523.52s eta: 3 days, 11:02:08
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 32.8GB text_tokens: 30152.0 tgs: 57 data_time: 0.81s time: 520.97s eta: 3 days, 10:29:11
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.8GB text_tokens: 31164.0 tgs: 60 data_time: 0.87s time: 519.11s eta: 3 days, 10:02:53
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 32.9GB text_tokens: 31740.0 tgs: 60 data_time: 0.82s time: 523.30s eta: 3 days, 10:33:54
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 32120.0 tgs: 61 data_time: 0.91s time: 522.91s eta: 3 days, 10:21:30
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.313 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 32.5GB text_tokens: 30357.0 tgs: 58 data_time: 0.77s time: 521.32s eta: 3 days, 9:57:49
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.9GB text_tokens: 32083.0 tgs: 61 data_time: 0.87s time: 520.64s eta: 3 days, 9:42:44
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.354 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 32.9GB text_tokens: 31963.0 tgs: 61 data_time: 0.88s time: 522.39s eta: 3 days, 9:50:29
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.1GB text_tokens: 31960.0 tgs: 61 data_time: 0.80s time: 523.76s eta: 3 days, 9:54:37
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.0GB text_tokens: 31654.0 tgs: 60 data_time: 0.80s time: 520.43s eta: 3 days, 9:14:43
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 31502.0 tgs: 60 data_time: 0.64s time: 520.68s eta: 3 days, 9:08:23
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 32.9GB text_tokens: 31836.0 tgs: 61 data_time: 0.85s time: 521.26s eta: 3 days, 9:05:03
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.9GB text_tokens: 31221.0 tgs: 59 data_time: 0.72s time: 524.12s eta: 3 days, 9:23:02
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 31303.0 tgs: 60 data_time: 0.95s time: 520.97s eta: 3 days, 8:44:58
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.1GB text_tokens: 31855.0 tgs: 61 data_time: 1.01s time: 520.16s eta: 3 days, 8:28:49
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.314 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 32.8GB text_tokens: 30932.0 tgs: 59 data_time: 0.70s time: 522.57s eta: 3 days, 8:42:29
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.216 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 31919.0 tgs: 60 data_time: 0.61s time: 524.34s eta: 3 days, 8:50:08
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.0GB text_tokens: 31714.0 tgs: 61 data_time: 0.63s time: 519.89s eta: 3 days, 8:00:20
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 32.9GB text_tokens: 32069.0 tgs: 61 data_time: 0.73s time: 520.07s eta: 3 days, 7:53:18
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.0GB text_tokens: 31550.0 tgs: 60 data_time: 0.62s time: 522.01s eta: 3 days, 8:02:26
|
| 340 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 32.2GB text_tokens: 30417.0 tgs: 58 data_time: 0.53s time: 524.36s eta: 3 days, 8:15:22
|
| 342 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 31874.0 tgs: 61 data_time: 0.85s time: 520.50s eta: 3 days, 7:31:13
|
| 344 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.219 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.7GB text_tokens: 31865.0 tgs: 61 data_time: 0.61s time: 518.57s eta: 3 days, 7:04:57
|
| 346 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:50:58][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 32131.0 tgs: 61 data_time: 0.95s time: 523.16s eta: 3 days, 7:38:12
|
| 348 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.1GB text_tokens: 32443.0 tgs: 61 data_time: 0.67s time: 523.63s eta: 3 days, 7:33:46
|
| 350 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.324 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.8GB text_tokens: 31045.0 tgs: 59 data_time: 0.76s time: 520.14s eta: 3 days, 6:53:16
|
| 352 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 32.9GB text_tokens: 31846.0 tgs: 61 data_time: 0.75s time: 520.06s eta: 3 days, 6:43:54
|
| 354 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.224 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 31945.0 tgs: 60 data_time: 0.96s time: 523.77s eta: 3 days, 7:08:48
|
| 356 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
|
| 357 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 32.9GB text_tokens: 32011.0 tgs: 61 data_time: 1.08s time: 523.78s eta: 3 days, 7:00:11
|
| 358 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
|
| 359 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 32547.0 tgs: 62 data_time: 0.97s time: 520.25s eta: 3 days, 6:19:33
|
| 360 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
|
| 361 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 32049.0 tgs: 61 data_time: 0.86s time: 518.79s eta: 3 days, 5:57:42
|
| 362 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
|
| 363 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.1GB text_tokens: 32269.0 tgs: 61 data_time: 0.85s time: 522.67s eta: 3 days, 6:24:03
|
| 364 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 30865.0 tgs: 59 data_time: 1.08s time: 522.70s eta: 3 days, 6:15:36
|
| 366 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.7GB text_tokens: 31114.0 tgs: 59 data_time: 1.09s time: 520.53s eta: 3 days, 5:47:27
|
| 368 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 32.8GB text_tokens: 32034.0 tgs: 61 data_time: 0.82s time: 519.67s eta: 3 days, 5:31:00
|
| 370 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.333 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 32.6GB text_tokens: 31575.0 tgs: 60 data_time: 1.25s time: 521.28s eta: 3 days, 5:36:47
|
| 372 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.336 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.4GB text_tokens: 30677.0 tgs: 58 data_time: 0.73s time: 523.60s eta: 3 days, 5:48:45
|
| 374 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.236 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.1GB text_tokens: 31925.0 tgs: 61 data_time: 0.93s time: 519.33s eta: 3 days, 5:02:02
|
| 376 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.0GB text_tokens: 32154.0 tgs: 61 data_time: 0.61s time: 520.63s eta: 3 days, 5:04:53
|
| 378 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 32123.0 tgs: 61 data_time: 0.82s time: 521.46s eta: 3 days, 5:03:35
|
| 380 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.8GB text_tokens: 31246.0 tgs: 59 data_time: 0.91s time: 523.95s eta: 3 days, 5:16:58
|
| 382 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 31119.0 tgs: 59 data_time: 0.87s time: 518.86s eta: 3 days, 4:23:16
|
| 384 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.221 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 32.9GB text_tokens: 31196.0 tgs: 60 data_time: 0.60s time: 519.23s eta: 3 days, 4:17:54
|
| 386 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 32.9GB text_tokens: 31947.0 tgs: 61 data_time: 0.69s time: 520.72s eta: 3 days, 4:22:21
|
| 388 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 32.6GB text_tokens: 31639.0 tgs: 60 data_time: 0.84s time: 524.26s eta: 3 days, 4:44:45
|
| 390 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.0GB text_tokens: 31322.0 tgs: 60 data_time: 1.05s time: 520.28s eta: 3 days, 4:01:06
|
| 392 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.0GB text_tokens: 31116.0 tgs: 59 data_time: 0.81s time: 519.50s eta: 3 days, 3:45:39
|
| 394 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 25][DP 6][SP 1][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.0GB text_tokens: 32184.0 tgs: 61 data_time: 0.66s time: 522.33s eta: 3 days, 4:01:38
|
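Every step above logs `grad_norm: nan` with an `if_nan_skip` counter that increments once per step: the trainer computes the gradient norm, finds it non-finite, discards the update, and keeps counting. A minimal sketch of such a guard, assuming a plain PyTorch loop (the helper name and the `skipped_steps` counter are illustrative, not XTuner's actual code):

```python
import math

import torch


def clip_and_maybe_skip(model, optimizer, max_grad_norm=1.0, skipped_steps=0):
    """Clip gradients, but skip the optimizer step if the norm is NaN/Inf.

    Mirrors the behaviour implied by the ``if_nan_skip`` counter above:
    the update is dropped, gradients are zeroed, and the skip count grows.
    (With FSDP you would call ``model.clip_grad_norm_`` instead.)
    """
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if not math.isfinite(grad_norm.item()):
        skipped_steps += 1
        optimizer.zero_grad()  # throw this step's gradients away
        print(f"The grad norm is NaN or Inf, skip this step. "
              f"Skipped {skipped_steps} steps in total.")
    else:
        optimizer.step()
        optimizer.zero_grad()
    return grad_norm, skipped_steps
```

Since the reduced loss and the gradient norm are aggregated across data-parallel ranks, a single rank producing non-finite gradients would make every rank report `loss(reduced): nan` and skip in lockstep, which is consistent with all ranks in this run logging identical skip counts.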
20250120235238/rank26.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.28s
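The WARNING lines above show per-shard length filtering: any sample whose tokenized prompt exceeds `max_length=32768` is discarded before the dataset is cached. A minimal sketch of that filter, assuming JSONL shards of OpenAI-format `messages` (the function name and the crude length proxy are illustrative; the real pipeline also applies the chat template and caches results under `dset_cache_dir`):

```python
import json


def filter_jsonl_by_length(path, tokenizer, max_length=32768):
    """Drop samples whose tokenized prompt exceeds max_length.

    Illustrative sketch only: counts and reports discards per shard,
    echoing the WARNING lines above.
    """
    kept, discarded = [], 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            sample = json.loads(line)
            # Crude length proxy: tokenize the concatenated message contents.
            text = "".join(m.get("content", "") for m in sample.get("messages", []))
            if len(tokenizer.encode(text)) > max_length:
                discarded += 1
            else:
                kept.append(sample)
    if discarded:
        print(f"{path} has {discarded} prompt length>{max_length}, discard.")
    return kept
```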
| 12 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 77 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 78 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 79 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 80 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 81 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 82 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 83 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 84 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 85 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 86 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 189 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 190 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 191 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 192 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 193 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 194 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 195 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 196 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 197 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 198 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 199 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 200 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 201 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 202 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 203 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 204 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 205 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 206 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 207 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 208 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 209 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 210 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 211 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 212 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 213 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 214 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 215 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 216 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 217 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 218 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 219 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 220 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 221 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 222 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 141.63 seconds, peak gpu memory 13.4G
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.223 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 31349.0 tgs: 57 data_time: 1.75s time: 547.03s eta: 3 days, 18:06:31
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.208 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 32045.0 tgs: 61 data_time: 0.54s time: 523.25s eta: 3 days, 14:02:44
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.0GB text_tokens: 31775.0 tgs: 60 data_time: 0.92s time: 522.85s eta: 3 days, 13:50:04
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.0GB text_tokens: 31442.0 tgs: 60 data_time: 0.83s time: 520.30s eta: 3 days, 13:16:17
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.0GB text_tokens: 31310.0 tgs: 60 data_time: 0.76s time: 520.99s eta: 3 days, 13:14:24
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.1GB text_tokens: 31970.0 tgs: 61 data_time: 0.88s time: 520.89s eta: 3 days, 13:04:41
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 32046.0 tgs: 61 data_time: 0.85s time: 523.39s eta: 3 days, 13:20:31
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 32280.0 tgs: 61 data_time: 0.67s time: 520.67s eta: 3 days, 12:45:15
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 32093.0 tgs: 61 data_time: 1.04s time: 520.18s eta: 3 days, 12:31:46
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.0GB text_tokens: 32007.0 tgs: 61 data_time: 0.54s time: 520.38s eta: 3 days, 12:24:59
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 32301.0 tgs: 61 data_time: 0.81s time: 524.54s eta: 3 days, 12:56:48
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.3GB text_tokens: 30685.0 tgs: 58 data_time: 0.83s time: 520.66s eta: 3 days, 12:10:23
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.199 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.1GB text_tokens: 30776.0 tgs: 59 data_time: 0.86s time: 519.93s eta: 3 days, 11:54:37
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.0GB text_tokens: 31850.0 tgs: 61 data_time: 0.74s time: 521.30s eta: 3 days, 11:59:12
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.0GB text_tokens: 32016.0 tgs: 61 data_time: 0.76s time: 524.13s eta: 3 days, 12:17:52
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.0GB text_tokens: 31587.0 tgs: 60 data_time: 1.03s time: 520.55s eta: 3 days, 11:34:36
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 32.9GB text_tokens: 30793.0 tgs: 59 data_time: 0.78s time: 518.80s eta: 3 days, 11:09:06
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.211 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31008.0 tgs: 59 data_time: 0.92s time: 522.11s eta: 3 days, 11:32:18
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.313 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.0GB text_tokens: 32462.0 tgs: 61 data_time: 0.88s time: 523.84s eta: 3 days, 11:40:07
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.0GB text_tokens: 32193.0 tgs: 61 data_time: 0.69s time: 520.44s eta: 3 days, 10:58:51
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 31420.0 tgs: 60 data_time: 0.82s time: 518.44s eta: 3 days, 10:31:07
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.0GB text_tokens: 31579.0 tgs: 60 data_time: 0.95s time: 522.85s eta: 3 days, 11:04:29
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.295 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 31823.0 tgs: 60 data_time: 0.83s time: 523.52s eta: 3 days, 11:02:08
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 32.8GB text_tokens: 30152.0 tgs: 57 data_time: 0.82s time: 520.97s eta: 3 days, 10:29:12
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.8GB text_tokens: 31164.0 tgs: 60 data_time: 0.93s time: 519.11s eta: 3 days, 10:02:53
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 32.9GB text_tokens: 31740.0 tgs: 60 data_time: 0.84s time: 523.30s eta: 3 days, 10:33:54
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 32120.0 tgs: 61 data_time: 0.96s time: 522.91s eta: 3 days, 10:21:31
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 32.5GB text_tokens: 30357.0 tgs: 58 data_time: 0.78s time: 521.33s eta: 3 days, 9:57:50
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.9GB text_tokens: 32083.0 tgs: 61 data_time: 0.88s time: 520.65s eta: 3 days, 9:42:44
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 32.9GB text_tokens: 31963.0 tgs: 61 data_time: 0.89s time: 522.39s eta: 3 days, 9:50:30
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.1GB text_tokens: 31960.0 tgs: 61 data_time: 0.81s time: 523.76s eta: 3 days, 9:54:36
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.0GB text_tokens: 31654.0 tgs: 60 data_time: 0.80s time: 520.43s eta: 3 days, 9:14:44
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 31502.0 tgs: 60 data_time: 0.65s time: 520.69s eta: 3 days, 9:08:24
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.222 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 32.9GB text_tokens: 31836.0 tgs: 61 data_time: 0.92s time: 521.26s eta: 3 days, 9:05:03
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.9GB text_tokens: 31221.0 tgs: 59 data_time: 0.75s time: 524.12s eta: 3 days, 9:23:03
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.231 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 31303.0 tgs: 60 data_time: 0.98s time: 520.97s eta: 3 days, 8:44:59
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.1GB text_tokens: 31855.0 tgs: 61 data_time: 1.06s time: 520.16s eta: 3 days, 8:28:50
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 32.8GB text_tokens: 30932.0 tgs: 59 data_time: 0.71s time: 522.57s eta: 3 days, 8:42:30
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 31919.0 tgs: 60 data_time: 0.63s time: 524.34s eta: 3 days, 8:50:09
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.0GB text_tokens: 31714.0 tgs: 61 data_time: 0.62s time: 519.89s eta: 3 days, 8:00:21
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.222 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 32.9GB text_tokens: 32069.0 tgs: 61 data_time: 0.74s time: 520.07s eta: 3 days, 7:53:18
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.0GB text_tokens: 31550.0 tgs: 60 data_time: 0.62s time: 522.01s eta: 3 days, 8:02:27
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 32.2GB text_tokens: 30417.0 tgs: 58 data_time: 0.53s time: 524.36s eta: 3 days, 8:15:22
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 31874.0 tgs: 61 data_time: 0.85s time: 520.50s eta: 3 days, 7:31:14
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.7GB text_tokens: 31865.0 tgs: 61 data_time: 0.61s time: 518.57s eta: 3 days, 7:04:56
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 32131.0 tgs: 61 data_time: 0.95s time: 523.16s eta: 3 days, 7:38:12
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.1GB text_tokens: 32443.0 tgs: 61 data_time: 0.67s time: 523.63s eta: 3 days, 7:33:47
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.8GB text_tokens: 31045.0 tgs: 59 data_time: 0.74s time: 520.14s eta: 3 days, 6:53:17
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 32.9GB text_tokens: 31846.0 tgs: 61 data_time: 0.76s time: 520.06s eta: 3 days, 6:43:54
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 31945.0 tgs: 60 data_time: 0.97s time: 523.77s eta: 3 days, 7:08:49
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 32.9GB text_tokens: 32011.0 tgs: 61 data_time: 1.09s time: 523.78s eta: 3 days, 7:00:11
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 32547.0 tgs: 62 data_time: 0.98s time: 520.25s eta: 3 days, 6:19:34
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 32049.0 tgs: 61 data_time: 0.87s time: 518.79s eta: 3 days, 5:57:43
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.1GB text_tokens: 32269.0 tgs: 61 data_time: 0.86s time: 522.67s eta: 3 days, 6:24:02
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.296 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 30865.0 tgs: 59 data_time: 1.09s time: 522.71s eta: 3 days, 6:15:38
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.7GB text_tokens: 31114.0 tgs: 59 data_time: 1.10s time: 520.53s eta: 3 days, 5:47:27
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 32.8GB text_tokens: 32034.0 tgs: 61 data_time: 0.84s time: 519.67s eta: 3 days, 5:31:01
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 32.6GB text_tokens: 31575.0 tgs: 60 data_time: 1.27s time: 521.28s eta: 3 days, 5:36:47
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.4GB text_tokens: 30677.0 tgs: 58 data_time: 0.77s time: 523.60s eta: 3 days, 5:48:45
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.1GB text_tokens: 31925.0 tgs: 61 data_time: 0.93s time: 519.33s eta: 3 days, 5:02:03
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.0GB text_tokens: 32154.0 tgs: 61 data_time: 0.64s time: 520.63s eta: 3 days, 5:04:54
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.325 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 32123.0 tgs: 61 data_time: 0.84s time: 521.46s eta: 3 days, 5:03:36
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.8GB text_tokens: 31246.0 tgs: 59 data_time: 0.94s time: 523.95s eta: 3 days, 5:16:59
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.298 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 31119.0 tgs: 59 data_time: 0.88s time: 518.86s eta: 3 days, 4:23:15
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.207 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 32.9GB text_tokens: 31196.0 tgs: 60 data_time: 0.61s time: 519.24s eta: 3 days, 4:17:55
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.350 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 32.9GB text_tokens: 31947.0 tgs: 61 data_time: 0.73s time: 520.72s eta: 3 days, 4:22:22
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 32.6GB text_tokens: 31639.0 tgs: 60 data_time: 0.85s time: 524.26s eta: 3 days, 4:44:46
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.322 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.0GB text_tokens: 31322.0 tgs: 60 data_time: 1.06s time: 520.28s eta: 3 days, 4:01:06
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.0GB text_tokens: 31116.0 tgs: 59 data_time: 0.81s time: 519.50s eta: 3 days, 3:45:39
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 26][DP 6][SP 2][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.0GB text_tokens: 32184.0 tgs: 61 data_time: 0.66s time: 522.33s eta: 3 days, 4:01:39
20250120235238/rank3.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.12s
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 189 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 190 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 191 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 192 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 193 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 194 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 195 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 196 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 197 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 198 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 199 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 200 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 201 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 202 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 203 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 204 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 205 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 206 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 207 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 208 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 209 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 210 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 211 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 212 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 213 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 214 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 215 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 216 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 217 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 218 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 219 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 220 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 221 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 222 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 245 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 246 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:07:59][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 254 |
+
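The DEBUG lines above show XTuner rebinding each submodule's forward to an optimized implementation. A minimal sketch of that kind of dispatch, assuming a standalone `rms_norm_forward` written against the public `Qwen2RMSNorm` attributes (`weight`, `variance_epsilon`); the real XTuner dispatcher is more general than this.

```python
import types

import torch


def rms_norm_forward(self, hidden_states):
    # Same math as Qwen2RMSNorm.forward, computed in float32 for stability.
    input_dtype = hidden_states.dtype
    hidden_states = hidden_states.to(torch.float32)
    variance = hidden_states.pow(2).mean(-1, keepdim=True)
    hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
    return self.weight * hidden_states.to(input_dtype)


def dispatch_forward(model, logger=print):
    # Walk the module tree and rebind forward on every RMSNorm instance,
    # emitting the same "Dispatch <name>(<cls>) forward to `<fn>`" lines
    # seen in this log.
    for name, module in model.named_modules():
        if type(module).__name__ == "Qwen2RMSNorm":
            module.forward = types.MethodType(rms_norm_forward, module)
            logger(f"Dispatch {name}({type(module).__name__}) "
                   f"forward to `rms_norm_forward`")
```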
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 144.62 seconds, peak gpu memory 13.4G
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
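The SUCCESS line reports wall-clock time and peak GPU memory for the parallelization phase. A sketch of how such figures can be collected with PyTorch's CUDA memory statistics; the helper name and message format here are illustrative, not XTuner's actual code.

```python
import time

import torch


def timed_with_peak_memory(fn, *args, **kwargs):
    # Reset the peak-memory counter so the maximum reflects only `fn`.
    torch.cuda.reset_peak_memory_stats()
    start = time.time()
    result = fn(*args, **kwargs)
    elapsed = time.time() - start
    peak_gb = torch.cuda.max_memory_allocated() / 1024**3
    print(f"[Parallelize LLM] Elapsed time {elapsed:.2f} seconds, "
          f"peak gpu memory {peak_gb:.1f}G")
    return result
```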
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 32.7GB text_tokens: 31548.0 tgs: 57 data_time: 1.96s time: 546.73s eta: 3 days, 18:03:31
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 32129.0 tgs: 61 data_time: 0.75s time: 523.23s eta: 3 days, 14:02:30
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.9GB text_tokens: 31947.0 tgs: 61 data_time: 0.80s time: 522.94s eta: 3 days, 13:50:56
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.0GB text_tokens: 31471.0 tgs: 60 data_time: 0.69s time: 520.27s eta: 3 days, 13:16:02
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.1GB text_tokens: 32503.0 tgs: 62 data_time: 0.71s time: 520.97s eta: 3 days, 13:14:10
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.360 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.7GB text_tokens: 31133.0 tgs: 59 data_time: 0.84s time: 520.91s eta: 3 days, 13:04:54
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.339 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.0GB text_tokens: 31847.0 tgs: 60 data_time: 1.18s time: 523.37s eta: 3 days, 13:20:17
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 32030.0 tgs: 61 data_time: 1.01s time: 520.65s eta: 3 days, 12:44:58
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.0GB text_tokens: 31786.0 tgs: 61 data_time: 0.70s time: 520.16s eta: 3 days, 12:31:35
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.9GB text_tokens: 32078.0 tgs: 61 data_time: 1.04s time: 520.47s eta: 3 days, 12:25:56
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 32240.0 tgs: 61 data_time: 0.89s time: 524.52s eta: 3 days, 12:56:32
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.311 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.9GB text_tokens: 31632.0 tgs: 60 data_time: 0.84s time: 520.63s eta: 3 days, 12:10:08
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 32.1GB text_tokens: 30691.0 tgs: 59 data_time: 0.74s time: 519.96s eta: 3 days, 11:54:56
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.1GB text_tokens: 32119.0 tgs: 61 data_time: 0.85s time: 521.27s eta: 3 days, 11:58:57
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.198 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.0GB text_tokens: 31154.0 tgs: 59 data_time: 0.64s time: 524.11s eta: 3 days, 12:17:39
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 32.9GB text_tokens: 31896.0 tgs: 61 data_time: 0.61s time: 520.52s eta: 3 days, 11:34:22
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.193 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 31357.0 tgs: 60 data_time: 0.88s time: 518.89s eta: 3 days, 11:09:59
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.225 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 30112.0 tgs: 57 data_time: 0.68s time: 522.09s eta: 3 days, 11:32:04
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.1GB text_tokens: 31775.0 tgs: 60 data_time: 0.86s time: 523.81s eta: 3 days, 11:39:52
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.1GB text_tokens: 32263.0 tgs: 61 data_time: 0.79s time: 520.47s eta: 3 days, 10:59:09
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.340 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 32.9GB text_tokens: 31621.0 tgs: 60 data_time: 1.05s time: 518.42s eta: 3 days, 10:30:52
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.362 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.1GB text_tokens: 32165.0 tgs: 61 data_time: 1.04s time: 522.82s eta: 3 days, 11:04:15
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 31731.0 tgs: 60 data_time: 0.57s time: 523.52s eta: 3 days, 11:02:10
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 31953.0 tgs: 61 data_time: 0.71s time: 521.01s eta: 3 days, 10:29:36
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.0GB text_tokens: 32071.0 tgs: 61 data_time: 0.80s time: 519.08s eta: 3 days, 10:02:39
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.214 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.1GB text_tokens: 32232.0 tgs: 61 data_time: 0.61s time: 523.28s eta: 3 days, 10:33:41
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 31872.0 tgs: 60 data_time: 0.68s time: 522.97s eta: 3 days, 10:22:03
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.0GB text_tokens: 31715.0 tgs: 60 data_time: 0.79s time: 521.30s eta: 3 days, 9:57:35
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.9GB text_tokens: 31142.0 tgs: 59 data_time: 0.66s time: 520.62s eta: 3 days, 9:42:30
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 32.7GB text_tokens: 31246.0 tgs: 59 data_time: 1.08s time: 522.42s eta: 3 days, 9:50:45
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.8GB text_tokens: 31427.0 tgs: 60 data_time: 0.86s time: 523.77s eta: 3 days, 9:54:42
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 31009.0 tgs: 59 data_time: 0.81s time: 520.41s eta: 3 days, 9:14:30
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.6GB text_tokens: 31125.0 tgs: 59 data_time: 0.68s time: 520.66s eta: 3 days, 9:08:11
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.1GB text_tokens: 32138.0 tgs: 61 data_time: 0.88s time: 521.32s eta: 3 days, 9:05:39
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.8GB text_tokens: 31588.0 tgs: 60 data_time: 0.91s time: 524.09s eta: 3 days, 9:22:48
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 32.8GB text_tokens: 31748.0 tgs: 60 data_time: 0.85s time: 520.94s eta: 3 days, 8:44:45
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.1GB text_tokens: 32016.0 tgs: 61 data_time: 0.91s time: 520.21s eta: 3 days, 8:29:19
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 32.9GB text_tokens: 31540.0 tgs: 60 data_time: 0.73s time: 522.55s eta: 3 days, 8:42:18
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.8GB text_tokens: 30666.0 tgs: 58 data_time: 0.79s time: 524.32s eta: 3 days, 8:49:55
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.8GB text_tokens: 31939.0 tgs: 61 data_time: 0.74s time: 519.87s eta: 3 days, 8:00:06
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 32.8GB text_tokens: 30286.0 tgs: 58 data_time: 0.78s time: 520.12s eta: 3 days, 7:53:45
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.1GB text_tokens: 32181.0 tgs: 61 data_time: 0.84s time: 522.00s eta: 3 days, 8:02:23
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 32188.0 tgs: 61 data_time: 0.71s time: 524.34s eta: 3 days, 8:15:09
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.310 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 32373.0 tgs: 62 data_time: 0.66s time: 520.56s eta: 3 days, 7:31:45
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.7GB text_tokens: 30468.0 tgs: 58 data_time: 0.61s time: 518.55s eta: 3 days, 7:04:44
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 31398.0 tgs: 60 data_time: 0.72s time: 523.14s eta: 3 days, 7:37:59
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.232 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 32.9GB text_tokens: 30639.0 tgs: 58 data_time: 0.56s time: 523.61s eta: 3 days, 7:33:33
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 33.1GB text_tokens: 32054.0 tgs: 61 data_time: 0.90s time: 520.20s eta: 3 days, 6:53:50
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 32.8GB text_tokens: 31329.0 tgs: 60 data_time: 0.89s time: 520.04s eta: 3 days, 6:43:41
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.332 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 30789.0 tgs: 58 data_time: 0.72s time: 523.74s eta: 3 days, 7:08:35
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.0GB text_tokens: 31630.0 tgs: 60 data_time: 0.76s time: 523.83s eta: 3 days, 7:00:37
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.315 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 31039.0 tgs: 59 data_time: 0.77s time: 520.22s eta: 3 days, 6:19:21
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 31956.0 tgs: 61 data_time: 1.00s time: 518.76s eta: 3 days, 5:57:29
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.1GB text_tokens: 32128.0 tgs: 61 data_time: 0.64s time: 522.67s eta: 3 days, 6:24:01
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.0GB text_tokens: 31665.0 tgs: 60 data_time: 0.64s time: 522.77s eta: 3 days, 6:16:11
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.8GB text_tokens: 31563.0 tgs: 60 data_time: 0.71s time: 520.51s eta: 3 days, 5:47:15
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:26:33][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 32095.0 tgs: 61 data_time: 0.65s time: 519.64s eta: 3 days, 5:30:47
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 32082.0 tgs: 61 data_time: 1.03s time: 521.34s eta: 3 days, 5:37:16
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.319 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 33.1GB text_tokens: 32250.0 tgs: 61 data_time: 0.66s time: 523.57s eta: 3 days, 5:48:31
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.1GB text_tokens: 31468.0 tgs: 60 data_time: 0.75s time: 519.29s eta: 3 days, 5:01:41
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.0GB text_tokens: 31281.0 tgs: 60 data_time: 0.80s time: 520.67s eta: 3 days, 5:05:14
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.1GB text_tokens: 31987.0 tgs: 61 data_time: 0.51s time: 521.47s eta: 3 days, 5:03:41
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.3GB text_tokens: 30618.0 tgs: 58 data_time: 0.58s time: 523.93s eta: 3 days, 5:16:46
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 32.4GB text_tokens: 30691.0 tgs: 59 data_time: 0.65s time: 518.84s eta: 3 days, 4:23:02
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 32.9GB text_tokens: 30766.0 tgs: 59 data_time: 0.61s time: 519.27s eta: 3 days, 4:18:14
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.0GB text_tokens: 30736.0 tgs: 59 data_time: 0.70s time: 520.70s eta: 3 days, 4:22:08
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.357 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.1GB text_tokens: 32451.0 tgs: 61 data_time: 0.70s time: 524.24s eta: 3 days, 4:44:33
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.0GB text_tokens: 30934.0 tgs: 59 data_time: 0.89s time: 520.34s eta: 3 days, 4:01:38
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.318 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 32071.0 tgs: 61 data_time: 0.73s time: 519.52s eta: 3 days, 3:45:47
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 3][DP 0][SP 3][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.232 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.0GB text_tokens: 31757.0 tgs: 60 data_time: 0.90s time: 522.30s eta: 3 days, 4:01:26
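Every step above is skipped: grad_norm is NaN from step 0, if_nan_skip climbs by one per step, and the optimizer never applies an update, so the run shown here makes no training progress. The skip pattern itself is the standard one: clip gradients, inspect the returned total norm, and bypass the optimizer step when it is not finite. A minimal sketch, with the counter and message mirroring the log; this is not XTuner's exact code.

```python
import torch

nan_skip_count = 0


def optimizer_step(model, optimizer, step, max_grad_norm=1.0):
    # clip_grad_norm_ returns the total norm computed *before* clipping,
    # so a NaN/Inf here means the backward pass produced bad gradients.
    global nan_skip_count
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if not torch.isfinite(grad_norm):
        nan_skip_count += 1
        print(f"[Step {step}] The grad norm is NaN or Inf, skip this step. "
              f"Skipped {nan_skip_count} steps in total.")
    else:
        optimizer.step()
    optimizer.zero_grad()
    return grad_norm
```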
20250120235238/rank31.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
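The Namespace above pins down the batching layout: with sp_size=4 on what these logs indicate is a 64-rank job, global_batch_size=64 and mirco_batch_size=1 imply 4 gradient-accumulation micro-steps per optimizer step. A worked check of that arithmetic; the world size is assumed from the rank indices appearing across these files, not read from the config.

```python
world_size = 64                  # assumed from the rank indices in these logs
sp_size = 4                      # sequence-parallel group size (Namespace)
dp_size = world_size // sp_size  # 16 data-parallel groups
micro_batch_size = 1             # `mirco_batch_size` in the Namespace
global_batch_size = 64           # samples consumed per optimizer step

grad_accum_steps = global_batch_size // (dp_size * micro_batch_size)
print(grad_accum_steps)  # -> 4 micro-batches per rank per optimizer step

# Sanity check: a flat rank maps to (dp, sp) under this layout.
rank = 31
print(divmod(rank, sp_size))  # -> (7, 3), matching "[RANK 31][DP 7][SP 3]"
```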
|
| 2 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.12s
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:08:02][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 141.70 seconds, peak gpu memory 13.4G
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 32271.0 tgs: 58 data_time: 1.84s time: 547.03s eta: 3 days, 18:06:29
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 32224.0 tgs: 61 data_time: 0.93s time: 523.25s eta: 3 days, 14:02:46
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.8GB text_tokens: 31186.0 tgs: 59 data_time: 0.93s time: 522.85s eta: 3 days, 13:50:04
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.1GB text_tokens: 32092.0 tgs: 61 data_time: 0.81s time: 520.30s eta: 3 days, 13:16:16
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 32.8GB text_tokens: 31643.0 tgs: 60 data_time: 0.77s time: 520.99s eta: 3 days, 13:14:24
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.0GB text_tokens: 31263.0 tgs: 60 data_time: 0.72s time: 520.89s eta: 3 days, 13:04:41
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.342 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 32.7GB text_tokens: 30097.0 tgs: 57 data_time: 0.80s time: 523.39s eta: 3 days, 13:20:31
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.346 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.0GB text_tokens: 32404.0 tgs: 62 data_time: 0.88s time: 520.67s eta: 3 days, 12:45:13
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 32.9GB text_tokens: 30582.0 tgs: 58 data_time: 0.82s time: 520.19s eta: 3 days, 12:31:49
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.1GB text_tokens: 32372.0 tgs: 62 data_time: 0.80s time: 520.38s eta: 3 days, 12:25:00
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 32256.0 tgs: 61 data_time: 0.71s time: 524.54s eta: 3 days, 12:56:47
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.0GB text_tokens: 31025.0 tgs: 59 data_time: 0.63s time: 520.66s eta: 3 days, 12:10:23
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.1GB text_tokens: 31804.0 tgs: 61 data_time: 0.87s time: 519.93s eta: 3 days, 11:54:37
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.312 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.1GB text_tokens: 32389.0 tgs: 62 data_time: 0.79s time: 521.30s eta: 3 days, 11:59:12
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.322 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.0GB text_tokens: 30946.0 tgs: 59 data_time: 0.87s time: 524.13s eta: 3 days, 12:17:53
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.229 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 31432.0 tgs: 60 data_time: 0.90s time: 520.55s eta: 3 days, 11:34:36
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.297 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 32194.0 tgs: 62 data_time: 0.73s time: 518.80s eta: 3 days, 11:09:06
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.1GB text_tokens: 31342.0 tgs: 60 data_time: 1.05s time: 522.11s eta: 3 days, 11:32:18
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.0GB text_tokens: 32054.0 tgs: 61 data_time: 0.67s time: 523.84s eta: 3 days, 11:40:07
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.1GB text_tokens: 32360.0 tgs: 62 data_time: 0.72s time: 520.44s eta: 3 days, 10:58:52
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.1GB text_tokens: 31497.0 tgs: 60 data_time: 0.61s time: 518.44s eta: 3 days, 10:31:06
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.1GB text_tokens: 32299.0 tgs: 61 data_time: 0.81s time: 522.85s eta: 3 days, 11:04:29
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 32054.0 tgs: 61 data_time: 0.64s time: 523.52s eta: 3 days, 11:02:08
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 31131.0 tgs: 59 data_time: 0.62s time: 520.97s eta: 3 days, 10:29:12
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.0GB text_tokens: 32103.0 tgs: 61 data_time: 0.75s time: 519.11s eta: 3 days, 10:02:53
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.218 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.0GB text_tokens: 32152.0 tgs: 61 data_time: 0.66s time: 523.30s eta: 3 days, 10:33:55
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 32.9GB text_tokens: 31336.0 tgs: 59 data_time: 0.96s time: 522.91s eta: 3 days, 10:21:31
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.321 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.0GB text_tokens: 31412.0 tgs: 60 data_time: 0.71s time: 521.33s eta: 3 days, 9:57:50
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.1GB text_tokens: 32135.0 tgs: 61 data_time: 0.69s time: 520.64s eta: 3 days, 9:42:44
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 32.9GB text_tokens: 31733.0 tgs: 60 data_time: 0.85s time: 522.39s eta: 3 days, 9:50:30
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.1GB text_tokens: 31765.0 tgs: 60 data_time: 0.85s time: 523.76s eta: 3 days, 9:54:37
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 31949.0 tgs: 61 data_time: 0.91s time: 520.44s eta: 3 days, 9:14:45
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.9GB text_tokens: 31335.0 tgs: 60 data_time: 0.60s time: 520.68s eta: 3 days, 9:08:24
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.0GB text_tokens: 31518.0 tgs: 60 data_time: 0.95s time: 521.26s eta: 3 days, 9:05:03
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
|
| 325 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.208 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.0GB text_tokens: 31987.0 tgs: 61 data_time: 0.92s time: 524.12s eta: 3 days, 9:23:02
|
| 326 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
|
| 327 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 32039.0 tgs: 61 data_time: 1.27s time: 520.97s eta: 3 days, 8:44:59
|
| 328 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
|
| 329 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.0GB text_tokens: 32103.0 tgs: 61 data_time: 0.77s time: 520.16s eta: 3 days, 8:28:49
|
| 330 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
|
| 331 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.1GB text_tokens: 32352.0 tgs: 61 data_time: 0.76s time: 522.57s eta: 3 days, 8:42:29
|
| 332 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
|
| 333 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.324 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.0GB text_tokens: 31385.0 tgs: 59 data_time: 1.01s time: 524.34s eta: 3 days, 8:50:08
|
| 334 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
|
| 335 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.1GB text_tokens: 31598.0 tgs: 60 data_time: 0.71s time: 519.90s eta: 3 days, 8:00:21
|
| 336 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
|
| 337 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.221 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 31223.0 tgs: 60 data_time: 0.76s time: 520.07s eta: 3 days, 7:53:18
|
| 338 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
|
| 339 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.310 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.1GB text_tokens: 32577.0 tgs: 62 data_time: 1.09s time: 522.01s eta: 3 days, 8:02:27
|
| 340 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.222 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 32.9GB text_tokens: 31503.0 tgs: 60 data_time: 0.75s time: 524.36s eta: 3 days, 8:15:22
|
| 342 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.330 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 32.9GB text_tokens: 31442.0 tgs: 60 data_time: 0.72s time: 520.50s eta: 3 days, 7:31:13
|
| 344 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.0GB text_tokens: 31080.0 tgs: 59 data_time: 0.84s time: 518.57s eta: 3 days, 7:04:56
|
| 346 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.219 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 31762.0 tgs: 60 data_time: 0.72s time: 523.16s eta: 3 days, 7:38:13
|
| 348 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 32.8GB text_tokens: 31859.0 tgs: 60 data_time: 0.84s time: 523.64s eta: 3 days, 7:33:48
|
| 350 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 33.1GB text_tokens: 32052.0 tgs: 61 data_time: 0.60s time: 520.14s eta: 3 days, 6:53:15
|
| 352 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 32.7GB text_tokens: 31795.0 tgs: 61 data_time: 0.77s time: 520.06s eta: 3 days, 6:43:54
|
| 354 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.0GB text_tokens: 31424.0 tgs: 59 data_time: 0.75s time: 523.77s eta: 3 days, 7:08:49
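Note the pattern in these rows: the per-rank `loss` stays in a normal 0.2-0.35 range while `loss(reduced)` is nan. The reduced value averages the loss over data-parallel ranks, so a single rank producing NaN poisons the mean for everyone. A sketch of that reduction, assuming a plain `torch.distributed` all-reduce (the op and grouping are assumptions, not XTuner's exact code):

```python
import torch
import torch.distributed as dist

def reduce_mean(local_loss: torch.Tensor, dp_group=None) -> torch.Tensor:
    """Average a scalar loss across data-parallel ranks.

    If any rank contributes NaN, the sum (and hence the mean) is NaN on
    every rank -- the finite `loss` / nan `loss(reduced)` pattern above.
    """
    reduced = local_loss.detach().clone()
    dist.all_reduce(reduced, op=dist.ReduceOp.SUM, group=dp_group)
    reduced /= dist.get_world_size(group=dp_group)
    return reduced
```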
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.0GB text_tokens: 32138.0 tgs: 61 data_time: 0.70s time: 523.78s eta: 3 days, 7:00:10
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 32221.0 tgs: 61 data_time: 0.92s time: 520.25s eta: 3 days, 6:19:34
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.0GB text_tokens: 31787.0 tgs: 61 data_time: 0.86s time: 518.79s eta: 3 days, 5:57:42
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.236 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.1GB text_tokens: 31727.0 tgs: 60 data_time: 0.65s time: 522.67s eta: 3 days, 6:24:04
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.0GB text_tokens: 31831.0 tgs: 60 data_time: 0.89s time: 522.70s eta: 3 days, 6:15:37
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.1GB text_tokens: 32054.0 tgs: 61 data_time: 0.94s time: 520.53s eta: 3 days, 5:47:27
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.213 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 32.7GB text_tokens: 31492.0 tgs: 60 data_time: 0.82s time: 519.67s eta: 3 days, 5:31:00
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 32.7GB text_tokens: 31059.0 tgs: 59 data_time: 0.68s time: 521.28s eta: 3 days, 5:36:48
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.7GB text_tokens: 30232.0 tgs: 57 data_time: 0.77s time: 523.60s eta: 3 days, 5:48:45
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.347 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.8GB text_tokens: 31767.0 tgs: 61 data_time: 0.80s time: 519.33s eta: 3 days, 5:02:03
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 32.9GB text_tokens: 32058.0 tgs: 61 data_time: 0.57s time: 520.63s eta: 3 days, 5:04:53
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.1GB text_tokens: 32164.0 tgs: 61 data_time: 0.65s time: 521.46s eta: 3 days, 5:03:36
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.8GB text_tokens: 31753.0 tgs: 60 data_time: 0.42s time: 523.95s eta: 3 days, 5:17:00
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 32.8GB text_tokens: 31116.0 tgs: 59 data_time: 0.66s time: 518.86s eta: 3 days, 4:23:15
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.0GB text_tokens: 31233.0 tgs: 60 data_time: 0.67s time: 519.23s eta: 3 days, 4:17:54
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.0GB text_tokens: 32074.0 tgs: 61 data_time: 0.71s time: 520.72s eta: 3 days, 4:22:22
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.0GB text_tokens: 31823.0 tgs: 60 data_time: 0.79s time: 524.26s eta: 3 days, 4:44:46
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 32008.0 tgs: 61 data_time: 0.71s time: 520.28s eta: 3 days, 4:01:06
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.0GB text_tokens: 32207.0 tgs: 61 data_time: 0.73s time: 519.50s eta: 3 days, 3:45:39
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 31][DP 7][SP 3][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.377 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 32.7GB text_tokens: 31813.0 tgs: 60 data_time: 0.65s time: 522.33s eta: 3 days, 4:01:39
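Every step shown in this file from 26 onward is skipped, so the optimizer never updates the weights even though the step counter, loss, and ETA keep advancing; the run is effectively stalled. The guard producing these warnings checks the global gradient norm before stepping. A minimal sketch of such a guard in a PyTorch loop (an assumed shape; the counter name mirrors the `if_nan_skip` log field, not XTuner's actual code):

```python
import math
import torch

def clip_and_step(model, optimizer, step, state, max_grad_norm=1.0):
    """Skip the optimizer step when the gradient norm is NaN or Inf."""
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if not math.isfinite(grad_norm.item()):
        state["if_nan_skip"] = state.get("if_nan_skip", 0) + 1
        print(f"[Step {step}] The grad norm is NaN or Inf, skip this step. "
              f"Skipped {state['if_nan_skip']} steps in total.")
    else:
        optimizer.step()
    optimizer.zero_grad()
    return grad_norm
```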
20250120235238/rank33.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
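The Namespace above fixes the parallel layout. This file tags itself `[RANK 33][DP 8][SP 1]`, consistent with rank = dp_index x sp_size + sp_index for sp_size=4 (the rank31 log above likewise maps to DP 7, SP 3). A quick check of the derived quantities; the world size is an assumption inferred from the rank numbering, not stated in the Namespace:

```python
world_size = 64          # assumption: ranks 0-63; not stated in the Namespace
sp_size = 4              # from the Namespace
dp_size = world_size // sp_size                  # 16 data-parallel groups

rank = 33
dp_idx, sp_idx = rank // sp_size, rank % sp_size
assert (dp_idx, sp_idx) == (8, 1)                # matches the "[DP 8][SP 1]" tag

micro_bs, global_bs = 1, 64                      # `mirco_batch_size` (sic) and global_batch_size
grad_accum = global_bs // (dp_size * micro_bs)   # 4 micro-steps per optimizer step
warmup_steps = round(0.025 * 593)                # warmup_ratio over the 593 total steps
print(dp_size, grad_accum, warmup_steps)         # 16 4 15
```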
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.15s
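Each warning in this block reports a few samples whose tokenized prompt exceeds max_length=32768; they are dropped before packing. A minimal sketch of that filter over one OpenAI-format JSONL shard (the helper and message handling are illustrative assumptions, not XTuner's implementation):

```python
import json

def drop_overlong(jsonl_path, tokenizer, max_length=32768):
    """Keep samples whose tokenized conversation fits within max_length."""
    kept, discarded = [], 0
    with open(jsonl_path) as f:
        for line in f:
            sample = json.loads(line)
            # Assumes OpenAI-format samples with a "messages" list and a
            # Hugging Face tokenizer exposing apply_chat_template.
            ids = tokenizer.apply_chat_template(sample["messages"])
            if len(ids) > max_length:
                discarded += 1
            else:
                kept.append(sample)
    if discarded:
        print(f"{jsonl_path} has {discarded} prompt length>{max_length}, discard.")
    return kept
```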
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 254 |
+
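The DEBUG lines above record each submodule's forward being rebound to a fused implementation (`rms_norm_forward`, `qwen2_attn_flash_forward`). A minimal Python sketch of that dispatch pattern, assuming a generic patching helper; the helper and the RMSNorm body below are illustrative, not XTuner's actual implementation:

    import types
    import torch

    def rms_norm_forward(self, hidden_states):
        # Same math as Qwen2RMSNorm.forward: scale by the reciprocal
        # root-mean-square, then apply the learned weight.
        input_dtype = hidden_states.dtype
        variance = hidden_states.float().pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states.float() * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight * hidden_states).to(input_dtype)

    def dispatch_forward(model, cls_name, new_forward, log=print):
        # Rebind `forward` on every submodule whose class name matches,
        # logging one line per module as in the DEBUG output above.
        for name, module in model.named_modules():
            if type(module).__name__ == cls_name:
                module.forward = types.MethodType(new_forward, module)
                log(f"Dispatch {name}({cls_name}) forward to `{new_forward.__name__}`")

    # e.g. dispatch_forward(model, "Qwen2RMSNorm", rms_norm_forward)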
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.28 seconds, peak gpu memory 13.4G
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 31323.0 tgs: 57 data_time: 1.86s time: 547.88s eta: 3 days, 18:14:52
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 31388.0 tgs: 59 data_time: 0.80s time: 523.24s eta: 3 days, 14:02:36
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.0GB text_tokens: 31981.0 tgs: 61 data_time: 1.00s time: 522.91s eta: 3 days, 13:50:39
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 32.9GB text_tokens: 31503.0 tgs: 60 data_time: 1.25s time: 520.28s eta: 3 days, 13:16:07
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.209 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.1GB text_tokens: 31606.0 tgs: 60 data_time: 0.64s time: 520.98s eta: 3 days, 13:14:15
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.236 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.1GB text_tokens: 31195.0 tgs: 59 data_time: 0.74s time: 520.89s eta: 3 days, 13:04:44
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.0GB text_tokens: 31593.0 tgs: 60 data_time: 0.74s time: 523.38s eta: 3 days, 13:20:23
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 31176.0 tgs: 59 data_time: 0.66s time: 520.66s eta: 3 days, 12:45:04
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 32152.0 tgs: 61 data_time: 0.66s time: 520.17s eta: 3 days, 12:31:40
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.226 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.0GB text_tokens: 29940.0 tgs: 57 data_time: 0.65s time: 520.43s eta: 3 days, 12:25:30
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 31618.0 tgs: 60 data_time: 0.92s time: 524.53s eta: 3 days, 12:56:39
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.333 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.0GB text_tokens: 31523.0 tgs: 60 data_time: 0.64s time: 520.64s eta: 3 days, 12:10:13
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.0GB text_tokens: 31105.0 tgs: 59 data_time: 0.83s time: 519.97s eta: 3 days, 11:54:59
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.8GB text_tokens: 31143.0 tgs: 59 data_time: 0.51s time: 521.28s eta: 3 days, 11:59:02
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 32.8GB text_tokens: 31851.0 tgs: 60 data_time: 0.57s time: 524.12s eta: 3 days, 12:17:44
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.225 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 31775.0 tgs: 61 data_time: 0.87s time: 520.53s eta: 3 days, 11:34:28
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 32.9GB text_tokens: 31815.0 tgs: 61 data_time: 0.90s time: 518.84s eta: 3 days, 11:09:33
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31366.0 tgs: 60 data_time: 0.78s time: 522.10s eta: 3 days, 11:32:09
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.9GB text_tokens: 31364.0 tgs: 59 data_time: 0.83s time: 523.82s eta: 3 days, 11:39:59
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.321 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 32.7GB text_tokens: 32004.0 tgs: 61 data_time: 0.93s time: 520.47s eta: 3 days, 10:59:09
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 31624.0 tgs: 60 data_time: 0.84s time: 518.43s eta: 3 days, 10:30:58
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.358 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.0GB text_tokens: 31087.0 tgs: 59 data_time: 0.68s time: 522.83s eta: 3 days, 11:04:20
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.206 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 31662.0 tgs: 60 data_time: 0.57s time: 523.54s eta: 3 days, 11:02:20
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 32.5GB text_tokens: 31237.0 tgs: 59 data_time: 0.80s time: 520.96s eta: 3 days, 10:29:06
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.9GB text_tokens: 31187.0 tgs: 60 data_time: 0.75s time: 519.09s eta: 3 days, 10:02:45
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.0GB text_tokens: 31088.0 tgs: 59 data_time: 0.66s time: 523.29s eta: 3 days, 10:33:46
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 31662.0 tgs: 60 data_time: 0.76s time: 522.95s eta: 3 days, 10:21:54
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.225 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.0GB text_tokens: 32039.0 tgs: 61 data_time: 0.95s time: 521.31s eta: 3 days, 9:57:41
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.1GB text_tokens: 32252.0 tgs: 61 data_time: 0.53s time: 520.63s eta: 3 days, 9:42:36
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 31895.0 tgs: 61 data_time: 0.95s time: 522.43s eta: 3 days, 9:50:50
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.226 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.7GB text_tokens: 31516.0 tgs: 60 data_time: 0.59s time: 523.75s eta: 3 days, 9:54:32
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 29894.0 tgs: 57 data_time: 0.80s time: 520.42s eta: 3 days, 9:14:35
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 31299.0 tgs: 60 data_time: 0.85s time: 520.67s eta: 3 days, 9:08:16
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 32.9GB text_tokens: 31641.0 tgs: 60 data_time: 0.82s time: 521.31s eta: 3 days, 9:05:31
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.7GB text_tokens: 31053.0 tgs: 59 data_time: 0.54s time: 524.10s eta: 3 days, 9:22:54
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.306 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 31452.0 tgs: 60 data_time: 0.83s time: 520.95s eta: 3 days, 8:44:51
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.321 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 32.7GB text_tokens: 31537.0 tgs: 60 data_time: 0.81s time: 520.17s eta: 3 days, 8:28:52
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.375 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.1GB text_tokens: 32333.0 tgs: 61 data_time: 0.85s time: 522.56s eta: 3 days, 8:42:21
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.215 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 32264.0 tgs: 61 data_time: 0.75s time: 524.33s eta: 3 days, 8:50:00
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.0GB text_tokens: 31571.0 tgs: 60 data_time: 0.62s time: 519.88s eta: 3 days, 8:00:12
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.0GB text_tokens: 32023.0 tgs: 61 data_time: 0.52s time: 520.13s eta: 3 days, 7:53:50
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.1GB text_tokens: 32305.0 tgs: 61 data_time: 0.78s time: 521.99s eta: 3 days, 8:02:19
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 32197.0 tgs: 61 data_time: 1.01s time: 524.35s eta: 3 days, 8:15:14
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 32.8GB text_tokens: 31127.0 tgs: 59 data_time: 0.59s time: 520.52s eta: 3 days, 7:31:28
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.8GB text_tokens: 31389.0 tgs: 60 data_time: 0.82s time: 518.56s eta: 3 days, 7:04:48
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.236 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 31307.0 tgs: 59 data_time: 0.76s time: 523.15s eta: 3 days, 7:38:05
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.1GB text_tokens: 31744.0 tgs: 60 data_time: 1.19s time: 523.62s eta: 3 days, 7:33:38
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.213 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.9GB text_tokens: 31146.0 tgs: 59 data_time: 0.96s time: 520.18s eta: 3 days, 6:53:40
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 30841.0 tgs: 59 data_time: 0.81s time: 520.05s eta: 3 days, 6:43:46
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 30859.0 tgs: 58 data_time: 1.13s time: 523.75s eta: 3 days, 7:08:41
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.0GB text_tokens: 31436.0 tgs: 60 data_time: 0.85s time: 523.80s eta: 3 days, 7:00:25
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 32042.0 tgs: 61 data_time: 0.92s time: 520.23s eta: 3 days, 6:19:26
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.229 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.9GB text_tokens: 31299.0 tgs: 60 data_time: 0.64s time: 518.77s eta: 3 days, 5:57:34
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 32.9GB text_tokens: 31820.0 tgs: 60 data_time: 0.54s time: 522.68s eta: 3 days, 6:24:06
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 31916.0 tgs: 61 data_time: 0.65s time: 522.73s eta: 3 days, 6:15:52
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.1GB text_tokens: 30595.0 tgs: 58 data_time: 0.73s time: 520.52s eta: 3 days, 5:47:20
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:26:33][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.329 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 32038.0 tgs: 61 data_time: 0.70s time: 519.65s eta: 3 days, 5:30:53
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 31912.0 tgs: 61 data_time: 0.76s time: 521.32s eta: 3 days, 5:37:06
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 33.0GB text_tokens: 31973.0 tgs: 61 data_time: 0.91s time: 523.58s eta: 3 days, 5:48:37
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 32142.0 tgs: 61 data_time: 0.67s time: 519.32s eta: 3 days, 5:01:54
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 32223.0 tgs: 61 data_time: 0.78s time: 520.66s eta: 3 days, 5:05:12
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 32203.0 tgs: 61 data_time: 0.61s time: 521.45s eta: 3 days, 5:03:29
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.223 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 30992.0 tgs: 59 data_time: 0.69s time: 523.94s eta: 3 days, 5:16:51
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 32317.0 tgs: 62 data_time: 0.99s time: 518.84s eta: 3 days, 4:23:07
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.1GB text_tokens: 31408.0 tgs: 60 data_time: 0.68s time: 519.27s eta: 3 days, 4:18:12
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.206 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.1GB text_tokens: 31491.0 tgs: 60 data_time: 0.75s time: 520.71s eta: 3 days, 4:22:14
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.1GB text_tokens: 30711.0 tgs: 58 data_time: 0.83s time: 524.25s eta: 3 days, 4:44:38
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.231 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 30847.0 tgs: 59 data_time: 0.75s time: 520.32s eta: 3 days, 4:01:28
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 32021.0 tgs: 61 data_time: 0.90s time: 519.49s eta: 3 days, 3:45:32
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 33][DP 8][SP 1][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 32505.0 tgs: 62 data_time: 0.96s time: 522.31s eta: 3 days, 4:01:31
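Every step logged above (0-69) tripped the NaN/Inf guard, so loss(reduced) and grad_norm stay nan and no optimizer update has landed yet; if_nan_skip mirrors the running skip count, and tgs is consistent with text_tokens / time per GPU (e.g. 31323.0 / 547.88s ≈ 57 at step 1). A minimal Python sketch of the skip behavior these WARNING lines describe; the names and structure here are assumptions, not XTuner's actual code:

    import torch

    nan_skip_total = 0

    def clip_and_step(model, optimizer, step, max_grad_norm=1.0):
        # Clip the global grad norm; if it comes back NaN/Inf, drop this
        # step's gradients and skip the update (mirrors the WARNING lines).
        global nan_skip_total
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        if not torch.isfinite(grad_norm):
            nan_skip_total += 1
            print(f"[Step {step}] The grad norm is NaN or Inf, skip this step. "
                  f"Skipped {nan_skip_total} steps in total.")
        else:
            optimizer.step()
        optimizer.zero_grad()
        return grad_norm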
20250120235238/rank34.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
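As a back-of-envelope check on this config (a sketch, not XTuner output): sp_size=4 splits ranks into sequence-parallel groups of 4, which is consistent with RANK 33 logging [DP 8][SP 1] and RANK 34 logging [DP 8][SP 2], since 33 = 8*4 + 1 and 34 = 8*4 + 2. Assuming a 64-rank job (an assumption; the world size is not stated in the Namespace), that leaves 16 data-parallel groups, so global_batch_size=64 with mirco_batch_size=1 implies 4 gradient-accumulation micro-steps per optimizer step:

    # Only sp_size and the batch sizes come from the Namespace above.
    world_size = 64                      # assumption: total ranks in the job
    sp_size = 4                          # from the Namespace
    mirco_batch_size = 1                 # from the Namespace (XTuner's CLI spelling)
    global_batch_size = 64               # from the Namespace

    dp_size = world_size // sp_size                                  # 16 DP groups
    grad_accum = global_batch_size // (dp_size * mirco_batch_size)   # 4
    print(f"dp={dp_size}, grad_accum={grad_accum}")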
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-20 23:54:30][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.12s
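Each WARNING above reports how many samples a .jsonl shard lost to the max_length=32768 cutoff. A minimal sketch of that filter, assuming one JSON object per line and an HF-style tokenizer in scope; the bare `prompt` field is hypothetical (this run actually uses the 'openai' message format, per dset_formats above):

    import json

    def filter_shard(path, tokenizer, max_length=32768):
        # Keep samples whose tokenized prompt fits; count and report the rest.
        kept, discarded = [], 0
        with open(path) as f:
            for line in f:
                sample = json.loads(line)
                if len(tokenizer.encode(sample["prompt"])) > max_length:
                    discarded += 1
                    continue
                kept.append(sample)
        if discarded:
            print(f"{path} has {discarded} prompt length>{max_length}, discard.")
        return kept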
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 189 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 190 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 191 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 192 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 193 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 194 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 195 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 196 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 197 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 198 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 199 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 200 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 201 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 202 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 203 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 204 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 205 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 206 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 207 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 208 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 209 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 210 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 211 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 212 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 213 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 214 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 215 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 216 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 217 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 218 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 219 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 220 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 221 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 222 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 245 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 246 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
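The DEBUG lines above record the dispatch phase: for every matching submodule, XTuner rebinds `forward` to an optimized implementation (`rms_norm_forward` for Qwen2RMSNorm, `qwen2_attn_flash_forward` for Qwen2FlashAttention2). A minimal sketch of that dispatch pattern, assuming standard PyTorch modules and the HF attribute names `weight`/`variance_epsilon`; it is illustrative, not XTuner's actual code:

```python
import types

import torch


def rms_norm_forward(self, hidden_states):
    # Plain-PyTorch RMSNorm using the module's own parameters; a real dispatch
    # target would typically call a fused kernel instead. `weight` and
    # `variance_epsilon` follow the HF Qwen2RMSNorm attribute names (assumption).
    variance = hidden_states.float().pow(2).mean(-1, keepdim=True)
    hidden_states = hidden_states.float() * torch.rsqrt(variance + self.variance_epsilon)
    return self.weight * hidden_states.to(self.weight.dtype)


# Class-name -> replacement forward; an attention entry would map
# "Qwen2FlashAttention2" to a flash-attention forward the same way.
DISPATCH_MAP = {"Qwen2RMSNorm": rms_norm_forward}


def dispatch_forwards(model, log=print):
    for name, module in model.named_modules():
        new_forward = DISPATCH_MAP.get(type(module).__name__)
        if new_forward is not None:
            # Bind as a method so the swap is transparent to Module.__call__.
            module.forward = types.MethodType(new_forward, module)
            log(f"Dispatch {name}({type(module).__name__}) forward to "
                f"`{new_forward.__name__}`")
```

Binding with `types.MethodType` makes the replacement behave like a normal bound method, which matches the per-module "Dispatch ... forward to ..." messages logged above.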
|
| 254 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.22 seconds, peak gpu memory 13.4G
|
| 255 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
|
| 256 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
|
| 257 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.222 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 31323.0 tgs: 57 data_time: 1.90s time: 547.83s eta: 3 days, 18:14:23
|
| 258 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
|
| 259 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 31388.0 tgs: 59 data_time: 0.88s time: 523.24s eta: 3 days, 14:02:35
|
| 260 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
|
| 261 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.0GB text_tokens: 31981.0 tgs: 61 data_time: 1.13s time: 522.91s eta: 3 days, 13:50:36
|
| 262 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
|
| 263 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.214 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 32.9GB text_tokens: 31503.0 tgs: 60 data_time: 1.38s time: 520.28s eta: 3 days, 13:16:07
|
| 264 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
|
| 265 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.1GB text_tokens: 31606.0 tgs: 60 data_time: 0.68s time: 520.98s eta: 3 days, 13:14:15
|
| 266 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
|
| 267 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.1GB text_tokens: 31195.0 tgs: 59 data_time: 0.80s time: 520.89s eta: 3 days, 13:04:44
|
| 268 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
|
| 269 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.0GB text_tokens: 31593.0 tgs: 60 data_time: 0.78s time: 523.38s eta: 3 days, 13:20:22
|
| 270 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
|
| 271 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 31176.0 tgs: 59 data_time: 0.68s time: 520.66s eta: 3 days, 12:45:03
|
| 272 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
|
| 273 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 32152.0 tgs: 61 data_time: 0.70s time: 520.17s eta: 3 days, 12:31:40
|
| 274 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
|
| 275 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.194 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.0GB text_tokens: 29940.0 tgs: 57 data_time: 0.69s time: 520.43s eta: 3 days, 12:25:29
|
| 276 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
|
| 277 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 31618.0 tgs: 60 data_time: 0.97s time: 524.52s eta: 3 days, 12:56:38
|
| 278 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
|
| 279 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.0GB text_tokens: 31523.0 tgs: 60 data_time: 0.68s time: 520.64s eta: 3 days, 12:10:14
|
| 280 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
|
| 281 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.0GB text_tokens: 31105.0 tgs: 59 data_time: 0.90s time: 519.96s eta: 3 days, 11:54:58
|
| 282 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
|
| 283 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.8GB text_tokens: 31143.0 tgs: 59 data_time: 0.62s time: 521.28s eta: 3 days, 11:59:03
|
| 284 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
|
| 285 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 32.8GB text_tokens: 31851.0 tgs: 60 data_time: 0.60s time: 524.12s eta: 3 days, 12:17:43
|
| 286 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
|
| 287 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 31775.0 tgs: 61 data_time: 0.89s time: 520.53s eta: 3 days, 11:34:28
|
| 288 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
|
| 289 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.320 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 32.9GB text_tokens: 31815.0 tgs: 61 data_time: 0.93s time: 518.84s eta: 3 days, 11:09:32
|
| 290 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
|
| 291 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31366.0 tgs: 60 data_time: 0.85s time: 522.10s eta: 3 days, 11:32:07
|
| 292 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
|
| 293 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.190 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.9GB text_tokens: 31364.0 tgs: 59 data_time: 0.86s time: 523.82s eta: 3 days, 11:39:57
|
| 294 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
|
| 295 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 32.7GB text_tokens: 32004.0 tgs: 61 data_time: 0.99s time: 520.47s eta: 3 days, 10:59:08
|
| 296 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
|
| 297 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.372 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 31624.0 tgs: 61 data_time: 0.87s time: 518.43s eta: 3 days, 10:30:57
|
| 298 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
|
| 299 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.232 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.0GB text_tokens: 31087.0 tgs: 59 data_time: 0.73s time: 522.83s eta: 3 days, 11:04:20
|
| 300 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
|
| 301 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 31662.0 tgs: 60 data_time: 0.60s time: 523.53s eta: 3 days, 11:02:17
|
| 302 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
|
| 303 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 32.5GB text_tokens: 31237.0 tgs: 59 data_time: 0.85s time: 520.96s eta: 3 days, 10:29:09
|
| 304 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
|
| 305 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.9GB text_tokens: 31187.0 tgs: 60 data_time: 0.79s time: 519.09s eta: 3 days, 10:02:44
|
| 306 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
|
| 307 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.0GB text_tokens: 31088.0 tgs: 59 data_time: 0.69s time: 523.29s eta: 3 days, 10:33:45
|
| 308 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
|
| 309 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 31662.0 tgs: 60 data_time: 0.81s time: 522.95s eta: 3 days, 10:21:53
|
| 310 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
|
| 311 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.226 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.0GB text_tokens: 32039.0 tgs: 61 data_time: 0.97s time: 521.31s eta: 3 days, 9:57:41
|
| 312 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
|
| 313 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.1GB text_tokens: 32252.0 tgs: 61 data_time: 0.58s time: 520.63s eta: 3 days, 9:42:37
|
| 314 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
|
| 315 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 31895.0 tgs: 61 data_time: 1.04s time: 522.42s eta: 3 days, 9:50:47
|
| 316 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
|
| 317 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.7GB text_tokens: 31516.0 tgs: 60 data_time: 0.62s time: 523.75s eta: 3 days, 9:54:32
|
| 318 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
|
| 319 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 29894.0 tgs: 57 data_time: 0.82s time: 520.42s eta: 3 days, 9:14:35
|
| 320 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
|
| 321 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.223 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 31299.0 tgs: 60 data_time: 0.91s time: 520.67s eta: 3 days, 9:08:16
|
| 322 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
|
| 323 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.220 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 32.9GB text_tokens: 31641.0 tgs: 60 data_time: 0.85s time: 521.30s eta: 3 days, 9:05:30
|
| 324 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
|
| 325 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.199 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.7GB text_tokens: 31053.0 tgs: 59 data_time: 0.56s time: 524.10s eta: 3 days, 9:22:53
|
| 326 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
|
| 327 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 31452.0 tgs: 60 data_time: 0.84s time: 520.95s eta: 3 days, 8:44:50
|
| 328 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
|
| 329 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 32.7GB text_tokens: 31537.0 tgs: 60 data_time: 0.83s time: 520.16s eta: 3 days, 8:28:51
|
| 330 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
|
| 331 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.1GB text_tokens: 32333.0 tgs: 61 data_time: 0.88s time: 522.56s eta: 3 days, 8:42:21
|
| 332 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
|
| 333 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.222 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 32264.0 tgs: 61 data_time: 0.79s time: 524.32s eta: 3 days, 8:49:59
|
| 334 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
|
| 335 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.225 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.0GB text_tokens: 31571.0 tgs: 60 data_time: 0.66s time: 519.88s eta: 3 days, 8:00:12
|
| 336 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
|
| 337 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.0GB text_tokens: 32023.0 tgs: 61 data_time: 0.55s time: 520.13s eta: 3 days, 7:53:49
|
| 338 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
|
| 339 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.1GB text_tokens: 32305.0 tgs: 61 data_time: 0.79s time: 521.99s eta: 3 days, 8:02:18
|
| 340 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.325 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 32197.0 tgs: 61 data_time: 1.05s time: 524.35s eta: 3 days, 8:15:15
|
| 342 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 32.8GB text_tokens: 31127.0 tgs: 59 data_time: 0.62s time: 520.52s eta: 3 days, 7:31:27
|
| 344 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.8GB text_tokens: 31389.0 tgs: 60 data_time: 0.86s time: 518.56s eta: 3 days, 7:04:48
|
| 346 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 31307.0 tgs: 59 data_time: 0.77s time: 523.15s eta: 3 days, 7:38:04
|
| 348 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.1GB text_tokens: 31744.0 tgs: 60 data_time: 1.21s time: 523.62s eta: 3 days, 7:33:37
|
| 350 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.216 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.9GB text_tokens: 31146.0 tgs: 59 data_time: 1.00s time: 520.18s eta: 3 days, 6:53:40
|
| 352 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.308 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 30841.0 tgs: 59 data_time: 0.84s time: 520.04s eta: 3 days, 6:43:44
|
| 354 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.298 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 30859.0 tgs: 58 data_time: 1.20s time: 523.75s eta: 3 days, 7:08:40
|
| 356 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
|
| 357 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.0GB text_tokens: 31436.0 tgs: 60 data_time: 0.87s time: 523.80s eta: 3 days, 7:00:25
|
| 358 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
|
| 359 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 32042.0 tgs: 61 data_time: 0.93s time: 520.23s eta: 3 days, 6:19:25
|
| 360 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
|
| 361 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.298 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.9GB text_tokens: 31299.0 tgs: 60 data_time: 0.65s time: 518.77s eta: 3 days, 5:57:33
|
| 362 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
|
| 363 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.341 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 32.9GB text_tokens: 31820.0 tgs: 60 data_time: 0.56s time: 522.68s eta: 3 days, 6:24:06
|
| 364 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 31916.0 tgs: 61 data_time: 0.68s time: 522.73s eta: 3 days, 6:15:52
|
| 366 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.1GB text_tokens: 30595.0 tgs: 58 data_time: 0.79s time: 520.52s eta: 3 days, 5:47:20
|
| 368 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:26:33][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.297 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 32038.0 tgs: 61 data_time: 0.72s time: 519.65s eta: 3 days, 5:30:51
|
| 370 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.336 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 31912.0 tgs: 61 data_time: 0.81s time: 521.32s eta: 3 days, 5:37:06
|
| 372 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.336 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 33.0GB text_tokens: 31973.0 tgs: 61 data_time: 0.94s time: 523.58s eta: 3 days, 5:48:37
|
| 374 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 32142.0 tgs: 61 data_time: 0.69s time: 519.31s eta: 3 days, 5:01:53
|
| 376 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 32223.0 tgs: 61 data_time: 0.81s time: 520.66s eta: 3 days, 5:05:11
|
| 378 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.311 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 32203.0 tgs: 61 data_time: 0.71s time: 521.45s eta: 3 days, 5:03:29
|
| 380 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 30992.0 tgs: 59 data_time: 0.71s time: 523.94s eta: 3 days, 5:16:51
|
| 382 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.370 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 32317.0 tgs: 62 data_time: 1.01s time: 518.84s eta: 3 days, 4:23:07
|
| 384 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.1GB text_tokens: 31408.0 tgs: 60 data_time: 0.72s time: 519.27s eta: 3 days, 4:18:11
|
| 386 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.1GB text_tokens: 31491.0 tgs: 60 data_time: 0.77s time: 520.71s eta: 3 days, 4:22:14
|
| 388 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.1GB text_tokens: 30711.0 tgs: 58 data_time: 0.84s time: 524.25s eta: 3 days, 4:44:38
|
| 390 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 30847.0 tgs: 59 data_time: 0.77s time: 520.32s eta: 3 days, 4:01:29
|
| 392 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 32021.0 tgs: 61 data_time: 0.94s time: 519.48s eta: 3 days, 3:45:29
|
| 394 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 34][DP 8][SP 2][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.372 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 32505.0 tgs: 62 data_time: 0.98s time: 522.32s eta: 3 days, 4:01:33
|
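Every step in this log trips the NaN/Inf guard: `if_nan_skip` climbs from 1 to 70 while `loss(reduced)` and `grad_norm` stay `nan`, so no optimizer update lands in the first 70 steps. A minimal sketch of the guard these WARNING lines record, assuming a plain PyTorch training loop rather than XTuner's actual trainer:

```python
import torch


def clip_and_step(model, optimizer, max_grad_norm, step, skipped, log=print):
    # Clip to obtain the total grad norm; if it is NaN/Inf, skip the update
    # and count the skip, as the `if_nan_skip` field above tallies.
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if not torch.isfinite(grad_norm):
        skipped += 1
        log(f"[Step {step}] The grad norm is NaN or Inf, skip this step. "
            f"Skipped {skipped} steps in total.")
    else:
        optimizer.step()
    optimizer.zero_grad()
    return grad_norm, skipped
```

Gradients are zeroed either way, so a skipped step costs a full forward/backward pass but leaves the parameters untouched.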
20250120235238/rank35.log ADDED
|
@@ -0,0 +1,395 @@
|
| 1 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
|
| 2 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-20 23:54:30][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
|
| 6 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
|
| 7 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
|
| 8 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
|
| 9 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
|
| 10 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
|
| 11 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.11s
|
| 12 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 77 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 78 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 79 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 80 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 81 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 82 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 83 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 84 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 85 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 86 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 189 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 190 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 191 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 192 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 193 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 194 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 195 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 196 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 197 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 198 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 199 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 200 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 201 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 202 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 203 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 204 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 205 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 206 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 207 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 208 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 209 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 210 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 211 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 212 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 213 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 214 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 215 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 216 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 217 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 218 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 219 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 220 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 221 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 222 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 245 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 246 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 254 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.28 seconds, peak gpu memory 13.4G
|
| 255 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
|
| 256 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
|
| 257 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 31323.0 tgs: 57 data_time: 1.87s time: 547.79s eta: 3 days, 18:14:00
|
| 258 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
|
| 259 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.314 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 31388.0 tgs: 59 data_time: 0.88s time: 523.24s eta: 3 days, 14:02:36
|
| 260 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
|
| 261 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.0GB text_tokens: 31981.0 tgs: 61 data_time: 1.02s time: 522.91s eta: 3 days, 13:50:38
|
| 262 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
|
| 263 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 32.9GB text_tokens: 31503.0 tgs: 60 data_time: 1.38s time: 520.29s eta: 3 days, 13:16:08
|
| 264 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
|
| 265 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.1GB text_tokens: 31606.0 tgs: 60 data_time: 0.67s time: 520.98s eta: 3 days, 13:14:15
|
| 266 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
|
| 267 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.1GB text_tokens: 31195.0 tgs: 59 data_time: 0.78s time: 520.89s eta: 3 days, 13:04:45
|
| 268 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
|
| 269 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.0GB text_tokens: 31593.0 tgs: 60 data_time: 0.75s time: 523.38s eta: 3 days, 13:20:22
|
| 270 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
|
| 271 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 31176.0 tgs: 59 data_time: 0.70s time: 520.66s eta: 3 days, 12:45:04
|
| 272 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
|
| 273 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 32152.0 tgs: 61 data_time: 0.71s time: 520.17s eta: 3 days, 12:31:40
|
| 274 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
|
| 275 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.231 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.0GB text_tokens: 29940.0 tgs: 57 data_time: 0.71s time: 520.43s eta: 3 days, 12:25:30
|
| 276 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
|
| 277 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 31618.0 tgs: 60 data_time: 0.97s time: 524.53s eta: 3 days, 12:56:39
|
| 278 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
|
| 279 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.0GB text_tokens: 31523.0 tgs: 60 data_time: 0.67s time: 520.64s eta: 3 days, 12:10:14
|
| 280 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
|
| 281 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.0GB text_tokens: 31105.0 tgs: 59 data_time: 0.88s time: 519.96s eta: 3 days, 11:54:59
|
| 282 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
|
| 283 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.8GB text_tokens: 31143.0 tgs: 59 data_time: 0.59s time: 521.28s eta: 3 days, 11:59:03
|
| 284 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
|
| 285 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 32.8GB text_tokens: 31851.0 tgs: 60 data_time: 0.60s time: 524.12s eta: 3 days, 12:17:45
|
| 286 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
|
| 287 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 31775.0 tgs: 61 data_time: 0.89s time: 520.53s eta: 3 days, 11:34:28
|
| 288 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
|
| 289 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 32.9GB text_tokens: 31815.0 tgs: 61 data_time: 0.93s time: 518.84s eta: 3 days, 11:09:32
|
| 290 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
|
| 291 |
+
[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31366.0 tgs: 60 data_time: 0.87s time: 522.10s eta: 3 days, 11:32:09
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.9GB text_tokens: 31364.0 tgs: 59 data_time: 0.87s time: 523.82s eta: 3 days, 11:39:58
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.307 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 32.7GB text_tokens: 32004.0 tgs: 61 data_time: 0.98s time: 520.47s eta: 3 days, 10:59:09
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 31624.0 tgs: 61 data_time: 0.86s time: 518.43s eta: 3 days, 10:30:58
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.0GB text_tokens: 31087.0 tgs: 59 data_time: 0.73s time: 522.83s eta: 3 days, 11:04:21
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.211 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 31662.0 tgs: 60 data_time: 0.61s time: 523.54s eta: 3 days, 11:02:20
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 32.5GB text_tokens: 31237.0 tgs: 59 data_time: 0.87s time: 520.96s eta: 3 days, 10:29:06
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.9GB text_tokens: 31187.0 tgs: 60 data_time: 0.77s time: 519.09s eta: 3 days, 10:02:44
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.219 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.0GB text_tokens: 31088.0 tgs: 59 data_time: 0.69s time: 523.29s eta: 3 days, 10:33:47
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 31662.0 tgs: 60 data_time: 0.80s time: 522.95s eta: 3 days, 10:21:54
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.0GB text_tokens: 32039.0 tgs: 61 data_time: 1.01s time: 521.31s eta: 3 days, 9:57:41
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.1GB text_tokens: 32252.0 tgs: 61 data_time: 0.57s time: 520.63s eta: 3 days, 9:42:36
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 31895.0 tgs: 61 data_time: 1.07s time: 522.43s eta: 3 days, 9:50:49
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.7GB text_tokens: 31516.0 tgs: 60 data_time: 0.62s time: 523.75s eta: 3 days, 9:54:33
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 29894.0 tgs: 57 data_time: 0.83s time: 520.42s eta: 3 days, 9:14:35
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.312 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 31299.0 tgs: 60 data_time: 0.89s time: 520.67s eta: 3 days, 9:08:17
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 32.9GB text_tokens: 31641.0 tgs: 60 data_time: 0.85s time: 521.31s eta: 3 days, 9:05:31
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.7GB text_tokens: 31053.0 tgs: 59 data_time: 0.56s time: 524.10s eta: 3 days, 9:22:53
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.306 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 31452.0 tgs: 60 data_time: 0.89s time: 520.95s eta: 3 days, 8:44:51
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 32.7GB text_tokens: 31537.0 tgs: 60 data_time: 0.83s time: 520.17s eta: 3 days, 8:28:52
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.1GB text_tokens: 32333.0 tgs: 61 data_time: 0.88s time: 522.56s eta: 3 days, 8:42:21
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 32264.0 tgs: 61 data_time: 0.79s time: 524.32s eta: 3 days, 8:50:00
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.213 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.0GB text_tokens: 31571.0 tgs: 60 data_time: 0.66s time: 519.88s eta: 3 days, 8:00:12
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.205 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.0GB text_tokens: 32023.0 tgs: 61 data_time: 0.55s time: 520.13s eta: 3 days, 7:53:50
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.366 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.1GB text_tokens: 32305.0 tgs: 61 data_time: 0.79s time: 521.99s eta: 3 days, 8:02:19
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.324 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 32197.0 tgs: 61 data_time: 1.07s time: 524.35s eta: 3 days, 8:15:14
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 32.8GB text_tokens: 31127.0 tgs: 59 data_time: 0.62s time: 520.52s eta: 3 days, 7:31:28
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.8GB text_tokens: 31389.0 tgs: 60 data_time: 0.86s time: 518.56s eta: 3 days, 7:04:48
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.213 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 31307.0 tgs: 59 data_time: 0.79s time: 523.15s eta: 3 days, 7:38:04
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.1GB text_tokens: 31744.0 tgs: 60 data_time: 1.24s time: 523.62s eta: 3 days, 7:33:38
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.9GB text_tokens: 31146.0 tgs: 59 data_time: 0.99s time: 520.18s eta: 3 days, 6:53:40
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 30841.0 tgs: 59 data_time: 0.85s time: 520.05s eta: 3 days, 6:43:46
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.320 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 30859.0 tgs: 58 data_time: 1.21s time: 523.75s eta: 3 days, 7:08:40
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.0GB text_tokens: 31436.0 tgs: 60 data_time: 0.87s time: 523.80s eta: 3 days, 7:00:25
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 32042.0 tgs: 61 data_time: 0.94s time: 520.23s eta: 3 days, 6:19:26
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.9GB text_tokens: 31299.0 tgs: 60 data_time: 0.66s time: 518.77s eta: 3 days, 5:57:34
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 32.9GB text_tokens: 31820.0 tgs: 60 data_time: 0.56s time: 522.68s eta: 3 days, 6:24:06
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 31916.0 tgs: 61 data_time: 0.69s time: 522.73s eta: 3 days, 6:15:53
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.1GB text_tokens: 30595.0 tgs: 58 data_time: 0.79s time: 520.52s eta: 3 days, 5:47:20
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:26:33][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.331 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 32038.0 tgs: 61 data_time: 0.72s time: 519.65s eta: 3 days, 5:30:52
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 31912.0 tgs: 61 data_time: 0.79s time: 521.32s eta: 3 days, 5:37:07
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 33.0GB text_tokens: 31973.0 tgs: 61 data_time: 0.94s time: 523.58s eta: 3 days, 5:48:36
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 32142.0 tgs: 61 data_time: 0.73s time: 519.32s eta: 3 days, 5:01:55
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 32223.0 tgs: 61 data_time: 0.82s time: 520.66s eta: 3 days, 5:05:12
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.236 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 32203.0 tgs: 61 data_time: 0.68s time: 521.45s eta: 3 days, 5:03:29
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 30992.0 tgs: 59 data_time: 0.72s time: 523.94s eta: 3 days, 5:16:51
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.363 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 32317.0 tgs: 62 data_time: 1.01s time: 518.85s eta: 3 days, 4:23:08
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.1GB text_tokens: 31408.0 tgs: 60 data_time: 0.70s time: 519.27s eta: 3 days, 4:18:12
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.1GB text_tokens: 31491.0 tgs: 60 data_time: 0.77s time: 520.71s eta: 3 days, 4:22:13
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.1GB text_tokens: 30711.0 tgs: 58 data_time: 0.86s time: 524.25s eta: 3 days, 4:44:38
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 30847.0 tgs: 59 data_time: 0.77s time: 520.32s eta: 3 days, 4:01:29
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 32021.0 tgs: 61 data_time: 0.92s time: 519.49s eta: 3 days, 3:45:31
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
+[XTuner][RANK 35][DP 8][SP 3][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 32505.0 tgs: 62 data_time: 0.99s time: 522.31s eta: 3 days, 4:01:32
20250120235238/rank38.log
ADDED
@@ -0,0 +1,395 @@
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-20 23:54:30][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.11s
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 245 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 246 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 254 |
+
[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.27 seconds, peak gpu memory 13.4G
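The DEBUG lines above record XTuner rebinding each submodule's `forward` to an optimized implementation before training starts. A minimal sketch of that per-module dispatch pattern, assuming a Hugging Face `transformers` Qwen2 model; the `rms_norm_forward` body below is a hypothetical stand-in, not XTuner's actual kernel:

```python
# Minimal sketch of per-module forward dispatch, assuming a loaded
# transformers Qwen2 model. `rms_norm_forward` is a hypothetical
# stand-in for the fused implementation the log refers to.
import types

import torch


def rms_norm_forward(self, hidden_states):
    # RMSNorm: scale by the reciprocal root-mean-square of the activations.
    variance = hidden_states.pow(2).mean(-1, keepdim=True)
    hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
    return self.weight * hidden_states


def dispatch_rms_norm(model):
    for name, module in model.named_modules():
        if type(module).__name__ == "Qwen2RMSNorm":
            # Rebind this instance's forward, mirroring the DEBUG lines above.
            module.forward = types.MethodType(rms_norm_forward, module)
            print(f"Dispatch {name}(Qwen2RMSNorm) forward to `rms_norm_forward`")
```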
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.0GB text_tokens: 32281.0 tgs: 58 data_time: 2.29s time: 547.46s eta: 3 days, 18:10:43
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 32206.0 tgs: 61 data_time: 0.83s time: 523.23s eta: 3 days, 14:02:32
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 32213.0 tgs: 61 data_time: 1.00s time: 522.91s eta: 3 days, 13:50:37
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.297 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 32.1GB text_tokens: 29331.0 tgs: 56 data_time: 0.94s time: 520.28s eta: 3 days, 13:16:06
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.1GB text_tokens: 31870.0 tgs: 61 data_time: 0.89s time: 520.98s eta: 3 days, 13:14:15
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.4GB text_tokens: 29472.0 tgs: 56 data_time: 0.72s time: 520.89s eta: 3 days, 13:04:43
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.0GB text_tokens: 32120.0 tgs: 61 data_time: 0.74s time: 523.38s eta: 3 days, 13:20:25
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.216 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.0GB text_tokens: 30854.0 tgs: 59 data_time: 0.66s time: 520.65s eta: 3 days, 12:45:01
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 32.8GB text_tokens: 31900.0 tgs: 61 data_time: 0.73s time: 520.17s eta: 3 days, 12:31:41
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.0GB text_tokens: 32109.0 tgs: 61 data_time: 0.81s time: 520.43s eta: 3 days, 12:25:29
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 31161.0 tgs: 59 data_time: 0.64s time: 524.52s eta: 3 days, 12:56:37
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.7GB text_tokens: 31536.0 tgs: 60 data_time: 0.79s time: 520.64s eta: 3 days, 12:10:13
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 32.9GB text_tokens: 32004.0 tgs: 61 data_time: 0.78s time: 519.96s eta: 3 days, 11:54:58
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.223 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.1GB text_tokens: 32359.0 tgs: 62 data_time: 0.81s time: 521.28s eta: 3 days, 11:59:03
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.308 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.0GB text_tokens: 32408.0 tgs: 61 data_time: 1.02s time: 524.12s eta: 3 days, 12:17:45
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.326 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 31413.0 tgs: 60 data_time: 0.98s time: 520.53s eta: 3 days, 11:34:28
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 31857.0 tgs: 61 data_time: 0.78s time: 518.84s eta: 3 days, 11:09:33
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.322 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 32.9GB text_tokens: 31570.0 tgs: 60 data_time: 0.72s time: 522.10s eta: 3 days, 11:32:09
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.1GB text_tokens: 31566.0 tgs: 60 data_time: 0.67s time: 523.82s eta: 3 days, 11:39:58
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.1GB text_tokens: 32203.0 tgs: 61 data_time: 0.84s time: 520.47s eta: 3 days, 10:59:09
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 30573.0 tgs: 58 data_time: 0.68s time: 518.43s eta: 3 days, 10:30:58
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 32.5GB text_tokens: 30164.0 tgs: 57 data_time: 0.71s time: 522.83s eta: 3 days, 11:04:20
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 31721.0 tgs: 60 data_time: 0.80s time: 523.53s eta: 3 days, 11:02:17
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 32.7GB text_tokens: 31235.0 tgs: 59 data_time: 0.81s time: 520.96s eta: 3 days, 10:29:09
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.337 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.9GB text_tokens: 32083.0 tgs: 61 data_time: 0.72s time: 519.10s eta: 3 days, 10:02:45
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 32.7GB text_tokens: 31111.0 tgs: 59 data_time: 0.91s time: 523.29s eta: 3 days, 10:33:46
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.0GB text_tokens: 31722.0 tgs: 60 data_time: 0.85s time: 522.95s eta: 3 days, 10:21:53
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.200 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 32.8GB text_tokens: 30903.0 tgs: 59 data_time: 0.79s time: 521.31s eta: 3 days, 9:57:41
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.1GB text_tokens: 31566.0 tgs: 60 data_time: 1.07s time: 520.63s eta: 3 days, 9:42:35
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.1GB text_tokens: 28740.0 tgs: 55 data_time: 0.83s time: 522.43s eta: 3 days, 9:50:50
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.0GB text_tokens: 31943.0 tgs: 60 data_time: 0.64s time: 523.75s eta: 3 days, 9:54:32
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 31226.0 tgs: 60 data_time: 0.86s time: 520.42s eta: 3 days, 9:14:36
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 31930.0 tgs: 61 data_time: 0.69s time: 520.67s eta: 3 days, 9:08:16
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.1GB text_tokens: 31763.0 tgs: 60 data_time: 0.90s time: 521.31s eta: 3 days, 9:05:31
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.0GB text_tokens: 32155.0 tgs: 61 data_time: 0.83s time: 524.10s eta: 3 days, 9:22:53
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.307 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 31819.0 tgs: 61 data_time: 0.92s time: 520.95s eta: 3 days, 8:44:51
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.204 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.1GB text_tokens: 32555.0 tgs: 62 data_time: 0.64s time: 520.17s eta: 3 days, 8:28:52
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.0GB text_tokens: 31507.0 tgs: 60 data_time: 0.75s time: 522.56s eta: 3 days, 8:42:22
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 32485.0 tgs: 61 data_time: 0.89s time: 524.32s eta: 3 days, 8:49:59
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.1GB text_tokens: 32286.0 tgs: 62 data_time: 0.93s time: 519.88s eta: 3 days, 8:00:12
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 32270.0 tgs: 62 data_time: 0.79s time: 520.13s eta: 3 days, 7:53:51
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 32.9GB text_tokens: 31886.0 tgs: 61 data_time: 0.94s time: 521.99s eta: 3 days, 8:02:18
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 32.6GB text_tokens: 31139.0 tgs: 59 data_time: 0.75s time: 524.35s eta: 3 days, 8:15:15
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.0GB text_tokens: 31827.0 tgs: 61 data_time: 0.68s time: 520.52s eta: 3 days, 7:31:28
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.0GB text_tokens: 32014.0 tgs: 61 data_time: 0.87s time: 518.56s eta: 3 days, 7:04:48
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.0GB text_tokens: 31950.0 tgs: 61 data_time: 1.07s time: 523.15s eta: 3 days, 7:38:04
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 32.6GB text_tokens: 31159.0 tgs: 59 data_time: 0.62s time: 523.62s eta: 3 days, 7:33:38
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 33.0GB text_tokens: 31876.0 tgs: 61 data_time: 0.97s time: 520.18s eta: 3 days, 6:53:40
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.307 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 31681.0 tgs: 60 data_time: 0.92s time: 520.05s eta: 3 days, 6:43:47
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.0GB text_tokens: 31594.0 tgs: 60 data_time: 1.19s time: 523.75s eta: 3 days, 7:08:40
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.226 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 32.9GB text_tokens: 31032.0 tgs: 59 data_time: 1.08s time: 523.80s eta: 3 days, 7:00:24
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.0GB text_tokens: 31014.0 tgs: 59 data_time: 1.00s time: 520.23s eta: 3 days, 6:19:26
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.329 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 32168.0 tgs: 62 data_time: 0.98s time: 518.77s eta: 3 days, 5:57:35
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.353 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 32.9GB text_tokens: 31813.0 tgs: 60 data_time: 0.74s time: 522.68s eta: 3 days, 6:24:06
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.0GB text_tokens: 32389.0 tgs: 61 data_time: 0.93s time: 522.73s eta: 3 days, 6:15:53
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.8GB text_tokens: 31439.0 tgs: 60 data_time: 0.69s time: 520.52s eta: 3 days, 5:47:20
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:26:33][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 31610.0 tgs: 60 data_time: 0.60s time: 519.65s eta: 3 days, 5:30:52
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 31493.0 tgs: 60 data_time: 0.87s time: 521.32s eta: 3 days, 5:37:06
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 33.1GB text_tokens: 31578.0 tgs: 60 data_time: 0.80s time: 523.58s eta: 3 days, 5:48:37
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.1GB text_tokens: 32478.0 tgs: 62 data_time: 1.06s time: 519.32s eta: 3 days, 5:01:55
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 32.8GB text_tokens: 29943.0 tgs: 57 data_time: 0.83s time: 520.66s eta: 3 days, 5:05:12
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 31038.0 tgs: 59 data_time: 0.57s time: 521.45s eta: 3 days, 5:03:29
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.8GB text_tokens: 31043.0 tgs: 59 data_time: 0.84s time: 523.94s eta: 3 days, 5:16:51
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.0GB text_tokens: 32383.0 tgs: 62 data_time: 0.86s time: 518.85s eta: 3 days, 4:23:08
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.0GB text_tokens: 31991.0 tgs: 61 data_time: 0.69s time: 519.27s eta: 3 days, 4:18:12
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 32.5GB text_tokens: 30412.0 tgs: 58 data_time: 0.83s time: 520.71s eta: 3 days, 4:22:14
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.1GB text_tokens: 31377.0 tgs: 59 data_time: 1.03s time: 524.25s eta: 3 days, 4:44:37
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.319 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 32.9GB text_tokens: 31712.0 tgs: 60 data_time: 0.75s time: 520.32s eta: 3 days, 4:01:29
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 32481.0 tgs: 62 data_time: 0.98s time: 519.49s eta: 3 days, 3:45:32
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
+[XTuner][RANK 38][DP 9][SP 2][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 31743.0 tgs: 60 data_time: 0.99s time: 522.31s eta: 3 days, 4:01:30
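Every step in this log trips the guard: the per-rank loss is finite, but the reduced loss and grad norm are NaN, so no optimizer update is applied and `if_nan_skip` climbs from 1 to 70. A minimal sketch of the kind of guard these WARNING lines describe, assuming a standard PyTorch loop (hypothetical; XTuner's real logic may differ):

```python
# Minimal sketch of a NaN/Inf grad-norm guard around an optimizer step,
# assuming a standard PyTorch training loop. Hypothetical; not XTuner's
# actual implementation.
import torch


def guarded_optimizer_step(model, optimizer, step, skipped, max_grad_norm=1.0):
    # clip_grad_norm_ returns the pre-clip total norm; NaN/Inf means the
    # gradients are unusable and the update should be skipped.
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if not torch.isfinite(grad_norm):
        skipped += 1
        print(f"[Step {step}] The grad norm is NaN or Inf, skip this step. "
              f"Skipped {skipped} steps in total.")
    else:
        optimizer.step()
    optimizer.zero_grad()
    return skipped
```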
20250120235238/rank4.log
ADDED
@@ -0,0 +1,395 @@
+[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
+[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
+[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-20 23:54:30][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
+[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
+[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
+[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
+[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
+[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.12s
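Per the WARNINGs above, samples whose tokenized length exceeds `max_length=32768` are dropped while the dataset cache is built. A minimal sketch of such an over-length filter; `tokenize_fn` is a hypothetical stand-in for the chat-template tokenization:

```python
# Minimal sketch of the over-length filter the WARNINGs describe.
# `tokenize_fn` is a hypothetical stand-in returning input ids per sample.
import json


def filter_overlong(path, tokenize_fn, max_length=32768):
    kept, discarded = [], 0
    with open(path) as f:
        for line in f:
            sample = json.loads(line)
            if len(tokenize_fn(sample)) > max_length:
                discarded += 1  # cannot fit in one 32k context window; drop it
            else:
                kept.append(sample)
    if discarded:
        print(f"{path} has {discarded} prompt length>{max_length}, discard.")
    return kept
```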
|
| 12 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.30 seconds, peak gpu memory 13.4G
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 32.6GB text_tokens: 31121.0 tgs: 56 data_time: 2.29s time: 546.73s eta: 3 days, 18:03:33
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.344 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 32.9GB text_tokens: 31200.0 tgs: 59 data_time: 0.96s time: 523.22s eta: 3 days, 14:02:28
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.8GB text_tokens: 30861.0 tgs: 59 data_time: 0.97s time: 522.94s eta: 3 days, 13:50:56
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.324 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.1GB text_tokens: 31960.0 tgs: 61 data_time: 1.16s time: 520.27s eta: 3 days, 13:16:01
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 32.9GB text_tokens: 31620.0 tgs: 60 data_time: 1.24s time: 520.97s eta: 3 days, 13:14:09
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.1GB text_tokens: 31899.0 tgs: 61 data_time: 0.80s time: 520.91s eta: 3 days, 13:04:55
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.219 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 32173.0 tgs: 61 data_time: 1.03s time: 523.37s eta: 3 days, 13:20:16
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.0GB text_tokens: 32073.0 tgs: 61 data_time: 0.82s time: 520.65s eta: 3 days, 12:44:59
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 32230.0 tgs: 61 data_time: 0.86s time: 520.16s eta: 3 days, 12:31:34
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.225 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.1GB text_tokens: 32211.0 tgs: 61 data_time: 0.72s time: 520.47s eta: 3 days, 12:25:56
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.0GB text_tokens: 31612.0 tgs: 60 data_time: 0.94s time: 524.52s eta: 3 days, 12:56:32
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.1GB text_tokens: 31664.0 tgs: 60 data_time: 0.70s time: 520.63s eta: 3 days, 12:10:09
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.0GB text_tokens: 31279.0 tgs: 60 data_time: 0.68s time: 519.96s eta: 3 days, 11:54:56
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.8GB text_tokens: 31574.0 tgs: 60 data_time: 0.89s time: 521.27s eta: 3 days, 11:58:58
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 32.6GB text_tokens: 31608.0 tgs: 60 data_time: 0.74s time: 524.11s eta: 3 days, 12:17:38
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.296 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 32396.0 tgs: 62 data_time: 0.89s time: 520.52s eta: 3 days, 11:34:22
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 32445.0 tgs: 62 data_time: 0.80s time: 518.89s eta: 3 days, 11:09:59
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.1GB text_tokens: 32483.0 tgs: 62 data_time: 0.90s time: 522.09s eta: 3 days, 11:32:03
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.8GB text_tokens: 30947.0 tgs: 59 data_time: 0.89s time: 523.81s eta: 3 days, 11:39:53
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.395 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.0GB text_tokens: 32390.0 tgs: 62 data_time: 0.82s time: 520.47s eta: 3 days, 10:59:09
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 32.9GB text_tokens: 31866.0 tgs: 61 data_time: 0.87s time: 518.42s eta: 3 days, 10:30:52
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 32.9GB text_tokens: 31128.0 tgs: 59 data_time: 0.84s time: 522.82s eta: 3 days, 11:04:15
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 32297.0 tgs: 61 data_time: 0.70s time: 523.52s eta: 3 days, 11:02:11
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.351 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 32495.0 tgs: 62 data_time: 1.06s time: 521.01s eta: 3 days, 10:29:36
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.9GB text_tokens: 31533.0 tgs: 60 data_time: 0.74s time: 519.09s eta: 3 days, 10:02:39
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 32.4GB text_tokens: 31074.0 tgs: 59 data_time: 0.77s time: 523.28s eta: 3 days, 10:33:40
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 32081.0 tgs: 61 data_time: 0.90s time: 522.97s eta: 3 days, 10:22:03
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.208 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 31457.0 tgs: 60 data_time: 0.82s time: 521.30s eta: 3 days, 9:57:35
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.206 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.9GB text_tokens: 31099.0 tgs: 59 data_time: 0.73s time: 520.62s eta: 3 days, 9:42:30
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 32202.0 tgs: 61 data_time: 0.77s time: 522.42s eta: 3 days, 9:50:44
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.7GB text_tokens: 30996.0 tgs: 59 data_time: 0.71s time: 523.77s eta: 3 days, 9:54:43
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.0GB text_tokens: 31401.0 tgs: 60 data_time: 0.93s time: 520.41s eta: 3 days, 9:14:30
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.382 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.7GB text_tokens: 31506.0 tgs: 60 data_time: 0.90s time: 520.66s eta: 3 days, 9:08:10
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.0GB text_tokens: 31849.0 tgs: 61 data_time: 0.82s time: 521.32s eta: 3 days, 9:05:39
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
|
| 325 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.330 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.0GB text_tokens: 31503.0 tgs: 60 data_time: 0.67s time: 524.09s eta: 3 days, 9:22:48
|
| 326 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
|
| 327 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 32241.0 tgs: 61 data_time: 0.79s time: 520.94s eta: 3 days, 8:44:45
|
| 328 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
|
| 329 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.0GB text_tokens: 31502.0 tgs: 60 data_time: 0.85s time: 520.21s eta: 3 days, 8:29:18
|
| 330 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
|
| 331 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 32.8GB text_tokens: 29886.0 tgs: 57 data_time: 1.01s time: 522.55s eta: 3 days, 8:42:18
|
| 332 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
|
| 333 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 32309.0 tgs: 61 data_time: 0.78s time: 524.32s eta: 3 days, 8:49:55
|
| 334 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
|
| 335 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.8GB text_tokens: 31388.0 tgs: 60 data_time: 0.92s time: 519.87s eta: 3 days, 8:00:06
|
| 336 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
|
| 337 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.341 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 31748.0 tgs: 61 data_time: 0.82s time: 520.12s eta: 3 days, 7:53:45
|
| 338 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
|
| 339 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.1GB text_tokens: 31342.0 tgs: 60 data_time: 0.70s time: 522.00s eta: 3 days, 8:02:23
|
| 340 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.0GB text_tokens: 31840.0 tgs: 60 data_time: 0.91s time: 524.34s eta: 3 days, 8:15:09
|
| 342 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 31789.0 tgs: 61 data_time: 0.66s time: 520.56s eta: 3 days, 7:31:46
|
| 344 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.0GB text_tokens: 31941.0 tgs: 61 data_time: 0.83s time: 518.55s eta: 3 days, 7:04:43
|
| 346 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 32.2GB text_tokens: 30569.0 tgs: 58 data_time: 0.77s time: 523.14s eta: 3 days, 7:37:59
|
| 348 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.1GB text_tokens: 31307.0 tgs: 59 data_time: 0.87s time: 523.61s eta: 3 days, 7:33:33
|
| 350 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.296 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.6GB text_tokens: 31568.0 tgs: 60 data_time: 0.79s time: 520.20s eta: 3 days, 6:53:51
|
| 352 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 31707.0 tgs: 60 data_time: 1.11s time: 520.04s eta: 3 days, 6:43:41
|
| 354 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.327 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 31866.0 tgs: 60 data_time: 0.90s time: 523.74s eta: 3 days, 7:08:35
|
| 356 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
|
| 357 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.356 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.1GB text_tokens: 31584.0 tgs: 60 data_time: 0.94s time: 523.83s eta: 3 days, 7:00:38
|
| 358 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
|
| 359 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.387 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.0GB text_tokens: 31870.0 tgs: 61 data_time: 0.98s time: 520.22s eta: 3 days, 6:19:19
|
| 360 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
|
| 361 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.8GB text_tokens: 30551.0 tgs: 58 data_time: 0.61s time: 518.76s eta: 3 days, 5:57:29
|
| 362 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
|
| 363 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.1GB text_tokens: 31961.0 tgs: 61 data_time: 0.82s time: 522.67s eta: 3 days, 6:24:01
|
| 364 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.332 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 31763.0 tgs: 60 data_time: 1.00s time: 522.77s eta: 3 days, 6:16:11
|
| 366 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.1GB text_tokens: 31917.0 tgs: 61 data_time: 1.08s time: 520.51s eta: 3 days, 5:47:14
|
| 368 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:26:33][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 31804.0 tgs: 61 data_time: 0.72s time: 519.64s eta: 3 days, 5:30:47
|
| 370 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 32278.0 tgs: 61 data_time: 0.99s time: 521.34s eta: 3 days, 5:37:16
|
| 372 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.9GB text_tokens: 31927.0 tgs: 60 data_time: 0.80s time: 523.57s eta: 3 days, 5:48:31
|
| 374 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 31369.0 tgs: 60 data_time: 0.98s time: 519.29s eta: 3 days, 5:01:41
|
| 376 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.0GB text_tokens: 32214.0 tgs: 61 data_time: 0.83s time: 520.67s eta: 3 days, 5:05:15
|
| 378 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 31816.0 tgs: 61 data_time: 0.97s time: 521.47s eta: 3 days, 5:03:41
|
| 380 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.297 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.9GB text_tokens: 31958.0 tgs: 60 data_time: 0.65s time: 523.93s eta: 3 days, 5:16:46
|
| 382 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 31545.0 tgs: 60 data_time: 1.03s time: 518.84s eta: 3 days, 4:23:02
|
| 384 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.1GB text_tokens: 31565.0 tgs: 60 data_time: 0.81s time: 519.27s eta: 3 days, 4:18:15
|
| 386 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.0GB text_tokens: 31753.0 tgs: 60 data_time: 0.71s time: 520.70s eta: 3 days, 4:22:08
|
| 388 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 32.5GB text_tokens: 30827.0 tgs: 58 data_time: 0.74s time: 524.24s eta: 3 days, 4:44:32
|
| 390 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 32.8GB text_tokens: 31476.0 tgs: 60 data_time: 1.08s time: 520.34s eta: 3 days, 4:01:38
|
| 392 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 31389.0 tgs: 60 data_time: 0.44s time: 519.52s eta: 3 days, 3:45:46
|
| 394 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 4][DP 1][SP 0][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.206 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 32.4GB text_tokens: 30909.0 tgs: 59 data_time: 0.91s time: 522.30s eta: 3 days, 4:01:26
|
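The repeating WARNING/INFO pairs above show a NaN guard firing on every step: the grad norm is checked, and when it is non-finite the optimizer step is skipped and a running counter (surfaced as `if_nan_skip` in the step lines) is incremented, so a bad batch never touches the weights. A minimal sketch of that pattern, assuming PyTorch's `clip_grad_norm_` as the norm source; the counter and message format are illustrative, not XTuner's actual implementation:

# Minimal sketch (assumptions, not XTuner source): clip, test the returned
# norm for NaN/Inf, and either step or skip-and-count.
import math

import torch
import torch.nn as nn

model = nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
nan_skip = 0  # mirrors the `if_nan_skip` column in the step logs

def guarded_step(step):
    global nan_skip
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    if not math.isfinite(grad_norm.item()):
        nan_skip += 1
        print(f"[Step {step}] The grad norm is NaN or Inf, skip this step. "
              f"Skipped {nan_skip} steps in total.")
    else:
        optimizer.step()
    optimizer.zero_grad()

loss = model(torch.randn(4, 8)).pow(2).mean()
loss.backward()
guarded_step(0)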
20250120235238/rank40.log
ADDED
|
@@ -0,0 +1,395 @@
|
| 1 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
|
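The Namespace line records the full launch configuration: Qwen2.5-72B-Instruct as the base model, sequence-parallel size 4, global batch size 64, an lr range of 2e-05 down to 6e-06 with warmup_ratio 0.025, max_length 32768, and soft packing. A hedged sketch of how a few of these flags could be declared with argparse (defaults copied from the log for illustration; this is not XTuner's CLI definition):

# Illustrative argparse sketch; parse_args echoes a Namespace(...) like the log.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--llm", type=str, required=True)
parser.add_argument("--chat-template", default="qwen2")
parser.add_argument("--sp-size", type=int, default=4)          # sequence-parallel world size
parser.add_argument("--max-length", type=int, default=32768)   # longer prompts are discarded
parser.add_argument("--global-batch-size", type=int, default=64)
parser.add_argument("--lr", type=float, default=2e-5)
parser.add_argument("--lr-min", type=float, default=6e-6)
parser.add_argument("--max-grad-norm", type=float, default=1.0)
args = parser.parse_args(["--llm", "Qwen/Qwen2.5-72B-Instruct"])  # hypothetical value
print(args)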
| 2 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
|
| 6 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
|
| 7 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
|
| 8 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
|
| 9 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
|
| 10 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
|
| 11 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.10s
|
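The eight warnings above indicate each JSONL shard is scanned once and samples whose tokenized prompt exceeds max_length (32768) are dropped before packing; the "Cost 443.10s" line totals this preprocessing. A minimal sketch of such a per-file filter, under stated assumptions: the `encode` callable and the `"text"` field are placeholders for the real tokenizer and the openai-format messages, and the warning text simply mimics the log.

# Sketch of a length filter that drops over-long samples and warns per file.
import json
import logging

def filter_by_length(path, encode, max_length=32768):
    kept, dropped = [], 0
    with open(path) as f:
        for line in f:
            sample = json.loads(line)
            if len(encode(sample["text"])) > max_length:
                dropped += 1
            else:
                kept.append(sample)
    if dropped:
        logging.warning("%s has %d prompt length>%d, discard.",
                        path, dropped, max_length)
    return kept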
| 12 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 77 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 78 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 79 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 80 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 81 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 82 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 83 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 84 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 85 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 86 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 189 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 190 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 191 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 192 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 193 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 194 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 195 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 196 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 197 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 198 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 199 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 200 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 201 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 202 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 203 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 204 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 205 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 206 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 207 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 208 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 209 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 210 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 211 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 212 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 213 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 214 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 215 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 216 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 217 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 218 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 219 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 220 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 221 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 222 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 245 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 246 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
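Each "Dispatch ... forward to ..." line above suggests the runtime walks the module tree and rebinds the `forward` of every matching submodule (attention blocks, RMSNorm layers, the final norm) to an optimized function. A minimal sketch of that rebinding pattern; `rms_norm_forward` here is an illustrative stand-in, not XTuner's fused kernel:

# Sketch: rebind forward on every submodule whose class name matches.
import types

import torch
import torch.nn as nn

def rms_norm_forward(self, x, eps=1e-6):
    # Stand-in RMSNorm: weight * x / rms(x); assumes the module has `weight`.
    var = x.pow(2).mean(-1, keepdim=True)
    return self.weight * x * torch.rsqrt(var + eps)

def dispatch_forward(model, cls_name, new_fn):
    for name, mod in model.named_modules():
        if type(mod).__name__ == cls_name:
            mod.forward = types.MethodType(new_fn, mod)
            print(f"Dispatch model.{name}({cls_name}) forward to `{new_fn.__name__}`")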
| 254 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.80 seconds, peak gpu memory 13.4G
|
| 255 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
|
| 256 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
|
| 257 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.205 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 32069.0 tgs: 58 data_time: 2.08s time: 547.71s eta: 3 days, 18:13:11
|
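The throughput fields in each step line can be checked against the line itself: `tgs` is consistent with `text_tokens` divided by the step time, and `eta` with the remaining steps times a (slightly smoothed) step time. A quick reconstruction from the Step 1 numbers above:

# Arithmetic check using the logged Step 1 values.
from datetime import timedelta

text_tokens, step_time = 32069.0, 547.71
total_steps, step = 593, 1

tgs = text_tokens / step_time                    # ~58 tokens/GPU/s, as logged
eta = timedelta(seconds=round((total_steps - step) * step_time))
print(f"tgs: {tgs:.0f} eta: {eta}")              # 3 days, 18:04:04; the logged
                                                 # 18:13:11 implies a smoothed step time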
| 258 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
|
| 259 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.232 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 32159.0 tgs: 61 data_time: 0.84s time: 523.25s eta: 3 days, 14:02:41
|
| 260 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
|
| 261 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.229 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 31619.0 tgs: 60 data_time: 0.98s time: 522.88s eta: 3 days, 13:50:24
|
| 262 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
|
| 263 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.1GB text_tokens: 32056.0 tgs: 61 data_time: 0.90s time: 520.29s eta: 3 days, 13:16:09
|
| 264 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
|
| 265 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 32.8GB text_tokens: 31273.0 tgs: 60 data_time: 0.99s time: 520.98s eta: 3 days, 13:14:16
|
| 266 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
|
| 267 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.8GB text_tokens: 31327.0 tgs: 60 data_time: 0.93s time: 520.91s eta: 3 days, 13:04:54
|
| 268 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
|
| 269 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 32396.0 tgs: 61 data_time: 0.83s time: 523.38s eta: 3 days, 13:20:24
|
| 270 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
|
| 271 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.353 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 32.9GB text_tokens: 31929.0 tgs: 61 data_time: 0.87s time: 520.66s eta: 3 days, 12:45:04
|
| 272 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
|
| 273 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 31933.0 tgs: 61 data_time: 0.84s time: 520.17s eta: 3 days, 12:31:41
|
| 274 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
|
| 275 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.9GB text_tokens: 31433.0 tgs: 60 data_time: 0.82s time: 520.42s eta: 3 days, 12:25:27
|
| 276 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
|
| 277 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 32287.0 tgs: 61 data_time: 1.04s time: 524.53s eta: 3 days, 12:56:40
|
| 278 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
|
| 279 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.327 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.9GB text_tokens: 31446.0 tgs: 60 data_time: 0.76s time: 520.64s eta: 3 days, 12:10:15
|
| 280 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
|
| 281 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.306 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 32.9GB text_tokens: 31841.0 tgs: 61 data_time: 0.71s time: 519.96s eta: 3 days, 11:54:58
|
| 282 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
|
| 283 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.9GB text_tokens: 31057.0 tgs: 59 data_time: 0.73s time: 521.28s eta: 3 days, 11:59:04
|
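The lr column over steps 1-14 is consistent with a linear warmup spanning floor(593 × 0.025) = 14 steps up to the configured peak of 2e-05, after which the logs hold at 0.000020. A quick reconstruction (inferred from the logged values, not XTuner's scheduler code):

# Linear warmup consistent with the lr column in steps 1-14.
lr_peak, total_steps, warmup_ratio = 2e-05, 593, 0.025
warmup_steps = int(total_steps * warmup_ratio)   # 14

for step in (1, 2, 3, 14):
    lr = lr_peak * step / warmup_steps
    print(f"Step {step}: lr {lr:.6f}")           # 0.000001, 0.000003, 0.000004, 0.000020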
| 284 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
|
| 285 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 32202.0 tgs: 61 data_time: 0.89s time: 524.12s eta: 3 days, 12:17:45
|
| 286 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
|
| 287 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 32492.0 tgs: 62 data_time: 0.62s time: 520.53s eta: 3 days, 11:34:28
|
| 288 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
|
| 289 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.296 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 31095.0 tgs: 59 data_time: 0.61s time: 518.82s eta: 3 days, 11:09:16
|
| 290 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
|
| 291 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.1GB text_tokens: 32615.0 tgs: 62 data_time: 0.69s time: 522.10s eta: 3 days, 11:32:09
|
| 292 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
|
| 293 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.0GB text_tokens: 32172.0 tgs: 61 data_time: 0.64s time: 523.83s eta: 3 days, 11:39:59
|
| 294 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
|
| 295 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.1GB text_tokens: 32340.0 tgs: 62 data_time: 0.60s time: 520.49s eta: 3 days, 10:59:18
|
| 296 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
|
| 297 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 32.9GB text_tokens: 31148.0 tgs: 60 data_time: 0.80s time: 518.43s eta: 3 days, 10:30:58
|
| 298 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
|
| 299 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 32.9GB text_tokens: 30975.0 tgs: 59 data_time: 0.63s time: 522.83s eta: 3 days, 11:04:21
|
| 300 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
|
| 301 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.180 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 32237.0 tgs: 61 data_time: 0.96s time: 523.54s eta: 3 days, 11:02:20
|
| 302 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
|
| 303 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 31482.0 tgs: 60 data_time: 0.86s time: 520.96s eta: 3 days, 10:29:08
|
| 304 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
|
| 305 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.0GB text_tokens: 31615.0 tgs: 60 data_time: 0.65s time: 519.09s eta: 3 days, 10:02:40
|
| 306 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
|
| 307 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.320 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.1GB text_tokens: 32372.0 tgs: 61 data_time: 0.93s time: 523.29s eta: 3 days, 10:33:47
|
| 308 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
|
| 309 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.222 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 27656.0 tgs: 52 data_time: 0.66s time: 522.96s eta: 3 days, 10:21:57
|
| 310 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
|
| 311 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 31754.0 tgs: 60 data_time: 0.54s time: 521.32s eta: 3 days, 9:57:45
|
| 312 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
|
| 313 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.363 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.1GB text_tokens: 32225.0 tgs: 61 data_time: 0.99s time: 520.62s eta: 3 days, 9:42:32
|
| 314 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
|
| 315 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 32178.0 tgs: 61 data_time: 0.86s time: 522.42s eta: 3 days, 9:50:43
|
| 316 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
|
| 317 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.0GB text_tokens: 32356.0 tgs: 61 data_time: 0.93s time: 523.74s eta: 3 days, 9:54:27
|
| 318 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
|
| 319 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 32084.0 tgs: 61 data_time: 0.53s time: 520.42s eta: 3 days, 9:14:38
|
| 320 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
|
| 321 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.9GB text_tokens: 31489.0 tgs: 60 data_time: 0.79s time: 520.67s eta: 3 days, 9:08:13
|
| 322 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
|
| 323 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.0GB text_tokens: 31760.0 tgs: 60 data_time: 0.76s time: 521.30s eta: 3 days, 9:05:27
|
| 324 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
|
| 325 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.0GB text_tokens: 31719.0 tgs: 60 data_time: 0.91s time: 524.11s eta: 3 days, 9:22:57
|
| 326 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
|
| 327 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 32.9GB text_tokens: 31694.0 tgs: 60 data_time: 0.89s time: 520.95s eta: 3 days, 8:44:47
|
| 328 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
|
| 329 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.231 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.1GB text_tokens: 31260.0 tgs: 60 data_time: 0.64s time: 520.19s eta: 3 days, 8:29:04
|
| 330 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
|
| 331 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.1GB text_tokens: 32424.0 tgs: 62 data_time: 1.20s time: 522.56s eta: 3 days, 8:42:22
|
| 332 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
|
| 333 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.9GB text_tokens: 31747.0 tgs: 60 data_time: 0.70s time: 524.33s eta: 3 days, 8:50:00
|
| 334 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
|
| 335 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.1GB text_tokens: 31806.0 tgs: 61 data_time: 0.89s time: 519.88s eta: 3 days, 8:00:13
|
| 336 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
|
| 337 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 32.9GB text_tokens: 31095.0 tgs: 59 data_time: 0.81s time: 520.12s eta: 3 days, 7:53:43
|
| 338 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
|
| 339 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 32.9GB text_tokens: 31720.0 tgs: 60 data_time: 0.95s time: 521.99s eta: 3 days, 8:02:17
|
| 340 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.306 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 31981.0 tgs: 60 data_time: 1.09s time: 524.35s eta: 3 days, 8:15:17
|
| 342 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.214 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.0GB text_tokens: 31115.0 tgs: 59 data_time: 0.84s time: 520.52s eta: 3 days, 7:31:24
|
| 344 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.311 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.1GB text_tokens: 32109.0 tgs: 61 data_time: 0.79s time: 518.56s eta: 3 days, 7:04:48
|
| 346 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.348 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.0GB text_tokens: 31354.0 tgs: 59 data_time: 0.75s time: 523.15s eta: 3 days, 7:38:04
|
| 348 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.0GB text_tokens: 31295.0 tgs: 59 data_time: 0.72s time: 523.62s eta: 3 days, 7:33:38
|
| 350 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 33.1GB text_tokens: 32080.0 tgs: 61 data_time: 0.76s time: 520.18s eta: 3 days, 6:53:40
|
| 352 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.0GB text_tokens: 32230.0 tgs: 61 data_time: 0.60s time: 520.05s eta: 3 days, 6:43:46
|
| 354 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 32.9GB text_tokens: 32091.0 tgs: 61 data_time: 0.80s time: 523.75s eta: 3 days, 7:08:41
|
| 356 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
|
| 357 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.223 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 32.7GB text_tokens: 31199.0 tgs: 59 data_time: 0.95s time: 523.79s eta: 3 days, 7:00:16
|
| 358 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
|
| 359 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 32.9GB text_tokens: 31340.0 tgs: 60 data_time: 0.76s time: 520.23s eta: 3 days, 6:19:26
|
| 360 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
|
| 361 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 31916.0 tgs: 61 data_time: 0.77s time: 518.77s eta: 3 days, 5:57:35
|
| 362 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
|
| 363 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.218 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.0GB text_tokens: 31919.0 tgs: 61 data_time: 0.62s time: 522.68s eta: 3 days, 6:24:05
|
| 364 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 32.5GB text_tokens: 31153.0 tgs: 59 data_time: 0.79s time: 522.75s eta: 3 days, 6:16:00
|
| 366 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.9GB text_tokens: 31392.0 tgs: 60 data_time: 0.89s time: 520.52s eta: 3 days, 5:47:20
|
| 368 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 32.8GB text_tokens: 31449.0 tgs: 60 data_time: 0.82s time: 519.65s eta: 3 days, 5:30:52
|
| 370 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 32236.0 tgs: 61 data_time: 0.75s time: 521.31s eta: 3 days, 5:36:59
|
| 372 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.7GB text_tokens: 31580.0 tgs: 60 data_time: 0.64s time: 523.58s eta: 3 days, 5:48:37
|
| 374 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 31801.0 tgs: 61 data_time: 0.66s time: 519.32s eta: 3 days, 5:01:55
|
| 376 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 31747.0 tgs: 60 data_time: 0.80s time: 520.66s eta: 3 days, 5:05:13
|
| 378 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 32.9GB text_tokens: 31667.0 tgs: 60 data_time: 0.99s time: 521.45s eta: 3 days, 5:03:29
|
| 380 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 31543.0 tgs: 60 data_time: 0.63s time: 523.94s eta: 3 days, 5:16:51
|
| 382 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.0GB text_tokens: 32204.0 tgs: 62 data_time: 0.65s time: 518.84s eta: 3 days, 4:23:07
|
| 384 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.316 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 32.7GB text_tokens: 30805.0 tgs: 59 data_time: 0.73s time: 519.27s eta: 3 days, 4:18:11
|
| 386 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.1GB text_tokens: 31921.0 tgs: 61 data_time: 0.92s time: 520.71s eta: 3 days, 4:22:13
|
| 388 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.1GB text_tokens: 32335.0 tgs: 61 data_time: 0.75s time: 524.25s eta: 3 days, 4:44:38
|
| 390 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 32297.0 tgs: 62 data_time: 0.84s time: 520.31s eta: 3 days, 4:01:25
|
| 392 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 31866.0 tgs: 61 data_time: 0.76s time: 519.49s eta: 3 days, 3:45:32
|
| 394 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 40][DP 10][SP 0][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 32561.0 tgs: 62 data_time: 0.59s time: 522.31s eta: 3 days, 4:01:31
|
20250120235238/rank42.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
|
| 2 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
|
| 6 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
|
| 7 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
|
| 8 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
|
| 9 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
|
| 10 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
|
| 11 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.10s
|
| 12 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 77 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 78 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 79 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 80 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 81 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 82 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 83 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 84 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 85 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 86 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.80 seconds, peak gpu memory 13.4G
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 32069.0 tgs: 58 data_time: 2.01s time: 547.70s eta: 3 days, 18:13:08
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 32159.0 tgs: 61 data_time: 0.80s time: 523.24s eta: 3 days, 14:02:40
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 31619.0 tgs: 60 data_time: 0.98s time: 522.89s eta: 3 days, 13:50:26
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.1GB text_tokens: 32056.0 tgs: 61 data_time: 0.87s time: 520.29s eta: 3 days, 13:16:08
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 32.8GB text_tokens: 31273.0 tgs: 60 data_time: 0.98s time: 520.98s eta: 3 days, 13:14:16
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.8GB text_tokens: 31327.0 tgs: 60 data_time: 0.91s time: 520.91s eta: 3 days, 13:04:54
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 32396.0 tgs: 61 data_time: 0.79s time: 523.38s eta: 3 days, 13:20:23
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.324 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 32.9GB text_tokens: 31929.0 tgs: 61 data_time: 0.83s time: 520.66s eta: 3 days, 12:45:04
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 31933.0 tgs: 61 data_time: 0.81s time: 520.17s eta: 3 days, 12:31:41
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.377 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.9GB text_tokens: 31433.0 tgs: 60 data_time: 0.80s time: 520.42s eta: 3 days, 12:25:26
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 32287.0 tgs: 61 data_time: 1.03s time: 524.53s eta: 3 days, 12:56:39
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.9GB text_tokens: 31446.0 tgs: 60 data_time: 0.73s time: 520.64s eta: 3 days, 12:10:14
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 32.9GB text_tokens: 31841.0 tgs: 61 data_time: 0.71s time: 519.96s eta: 3 days, 11:54:59
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.9GB text_tokens: 31057.0 tgs: 59 data_time: 0.71s time: 521.28s eta: 3 days, 11:59:04
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 32202.0 tgs: 61 data_time: 0.95s time: 524.12s eta: 3 days, 12:17:45
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 32492.0 tgs: 62 data_time: 0.63s time: 520.53s eta: 3 days, 11:34:28
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.229 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 31095.0 tgs: 59 data_time: 0.62s time: 518.82s eta: 3 days, 11:09:16
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.1GB text_tokens: 32615.0 tgs: 62 data_time: 0.68s time: 522.10s eta: 3 days, 11:32:09
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.314 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.0GB text_tokens: 32172.0 tgs: 61 data_time: 0.62s time: 523.83s eta: 3 days, 11:39:59
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.1GB text_tokens: 32340.0 tgs: 62 data_time: 0.59s time: 520.48s eta: 3 days, 10:59:18
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 32.9GB text_tokens: 31148.0 tgs: 60 data_time: 0.75s time: 518.43s eta: 3 days, 10:30:59
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 32.9GB text_tokens: 30975.0 tgs: 59 data_time: 0.62s time: 522.83s eta: 3 days, 11:04:21
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 32237.0 tgs: 61 data_time: 0.85s time: 523.53s eta: 3 days, 11:02:18
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 31482.0 tgs: 60 data_time: 0.85s time: 520.96s eta: 3 days, 10:29:07
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.321 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.0GB text_tokens: 31615.0 tgs: 60 data_time: 0.66s time: 519.10s eta: 3 days, 10:02:45
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.343 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.1GB text_tokens: 32372.0 tgs: 61 data_time: 0.93s time: 523.29s eta: 3 days, 10:33:47
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 27656.0 tgs: 52 data_time: 0.66s time: 522.96s eta: 3 days, 10:21:57
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 31754.0 tgs: 60 data_time: 0.54s time: 521.31s eta: 3 days, 9:57:41
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.1GB text_tokens: 32225.0 tgs: 61 data_time: 0.99s time: 520.63s eta: 3 days, 9:42:37
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 32178.0 tgs: 61 data_time: 0.86s time: 522.42s eta: 3 days, 9:50:42
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.0GB text_tokens: 32356.0 tgs: 61 data_time: 0.95s time: 523.75s eta: 3 days, 9:54:29
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 32084.0 tgs: 61 data_time: 0.53s time: 520.42s eta: 3 days, 9:14:36
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.9GB text_tokens: 31489.0 tgs: 60 data_time: 0.79s time: 520.67s eta: 3 days, 9:08:16
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.317 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.0GB text_tokens: 31760.0 tgs: 60 data_time: 0.80s time: 521.30s eta: 3 days, 9:05:26
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.0GB text_tokens: 31719.0 tgs: 60 data_time: 0.91s time: 524.11s eta: 3 days, 9:22:55
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 32.9GB text_tokens: 31694.0 tgs: 60 data_time: 0.89s time: 520.95s eta: 3 days, 8:44:51
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.1GB text_tokens: 31260.0 tgs: 60 data_time: 0.54s time: 520.19s eta: 3 days, 8:29:04
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.372 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.1GB text_tokens: 32424.0 tgs: 62 data_time: 1.19s time: 522.56s eta: 3 days, 8:42:22
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.9GB text_tokens: 31747.0 tgs: 60 data_time: 0.69s time: 524.33s eta: 3 days, 8:50:01
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.1GB text_tokens: 31806.0 tgs: 61 data_time: 0.88s time: 519.88s eta: 3 days, 8:00:13
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 32.9GB text_tokens: 31095.0 tgs: 59 data_time: 0.80s time: 520.11s eta: 3 days, 7:53:41
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 32.9GB text_tokens: 31720.0 tgs: 60 data_time: 0.95s time: 521.99s eta: 3 days, 8:02:20
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.313 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 31981.0 tgs: 60 data_time: 1.08s time: 524.35s eta: 3 days, 8:15:14
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.296 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.0GB text_tokens: 31115.0 tgs: 59 data_time: 0.85s time: 520.52s eta: 3 days, 7:31:27
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.1GB text_tokens: 32109.0 tgs: 61 data_time: 0.80s time: 518.56s eta: 3 days, 7:04:48
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.0GB text_tokens: 31354.0 tgs: 59 data_time: 0.74s time: 523.15s eta: 3 days, 7:38:05
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.0GB text_tokens: 31295.0 tgs: 59 data_time: 0.70s time: 523.62s eta: 3 days, 7:33:40
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.317 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 33.1GB text_tokens: 32080.0 tgs: 61 data_time: 0.76s time: 520.18s eta: 3 days, 6:53:39
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.0GB text_tokens: 32230.0 tgs: 61 data_time: 0.60s time: 520.05s eta: 3 days, 6:43:46
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 32.9GB text_tokens: 32091.0 tgs: 61 data_time: 0.80s time: 523.75s eta: 3 days, 7:08:41
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.231 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 32.7GB text_tokens: 31199.0 tgs: 59 data_time: 0.94s time: 523.79s eta: 3 days, 7:00:18
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 32.9GB text_tokens: 31340.0 tgs: 60 data_time: 0.76s time: 520.23s eta: 3 days, 6:19:27
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 31916.0 tgs: 61 data_time: 0.77s time: 518.77s eta: 3 days, 5:57:35
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.224 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.0GB text_tokens: 31919.0 tgs: 61 data_time: 0.62s time: 522.68s eta: 3 days, 6:24:07
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 32.5GB text_tokens: 31153.0 tgs: 59 data_time: 0.79s time: 522.75s eta: 3 days, 6:16:00
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.9GB text_tokens: 31392.0 tgs: 60 data_time: 0.89s time: 520.52s eta: 3 days, 5:47:20
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 32.8GB text_tokens: 31449.0 tgs: 60 data_time: 0.82s time: 519.65s eta: 3 days, 5:30:53
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.210 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 32236.0 tgs: 61 data_time: 0.75s time: 521.31s eta: 3 days, 5:37:00
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.7GB text_tokens: 31580.0 tgs: 60 data_time: 0.64s time: 523.59s eta: 3 days, 5:48:38
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 31801.0 tgs: 61 data_time: 0.61s time: 519.32s eta: 3 days, 5:01:55
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 31747.0 tgs: 60 data_time: 0.78s time: 520.66s eta: 3 days, 5:05:13
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 32.9GB text_tokens: 31667.0 tgs: 60 data_time: 0.99s time: 521.45s eta: 3 days, 5:03:29
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 31543.0 tgs: 60 data_time: 0.62s time: 523.94s eta: 3 days, 5:16:52
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.0GB text_tokens: 32204.0 tgs: 62 data_time: 0.64s time: 518.85s eta: 3 days, 4:23:09
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 32.7GB text_tokens: 30805.0 tgs: 59 data_time: 0.73s time: 519.26s eta: 3 days, 4:18:10
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.1GB text_tokens: 31921.0 tgs: 61 data_time: 0.92s time: 520.71s eta: 3 days, 4:22:14
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.1GB text_tokens: 32335.0 tgs: 61 data_time: 0.75s time: 524.25s eta: 3 days, 4:44:38
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.220 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 32297.0 tgs: 62 data_time: 0.84s time: 520.31s eta: 3 days, 4:01:24
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 31866.0 tgs: 61 data_time: 0.73s time: 519.49s eta: 3 days, 3:45:33
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 42][DP 10][SP 2][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 32561.0 tgs: 62 data_time: 0.59s time: 522.31s eta: 3 days, 4:01:31
20250120235238/rank43.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-20 23:54:30][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.13s
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.69 seconds, peak gpu memory 13.4G
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.360 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 32069.0 tgs: 58 data_time: 2.01s time: 547.56s eta: 3 days, 18:11:41
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 32159.0 tgs: 61 data_time: 0.85s time: 523.24s eta: 3 days, 14:02:36
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 31619.0 tgs: 60 data_time: 1.00s time: 522.89s eta: 3 days, 13:50:25
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.1GB text_tokens: 32056.0 tgs: 61 data_time: 0.92s time: 520.29s eta: 3 days, 13:16:09
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.329 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 32.8GB text_tokens: 31273.0 tgs: 60 data_time: 1.02s time: 520.98s eta: 3 days, 13:14:15
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.318 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.8GB text_tokens: 31327.0 tgs: 60 data_time: 0.93s time: 520.91s eta: 3 days, 13:04:54
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 32396.0 tgs: 61 data_time: 0.82s time: 523.38s eta: 3 days, 13:20:23
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.298 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 32.9GB text_tokens: 31929.0 tgs: 61 data_time: 0.82s time: 520.66s eta: 3 days, 12:45:04
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 31933.0 tgs: 61 data_time: 0.83s time: 520.17s eta: 3 days, 12:31:40
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.9GB text_tokens: 31433.0 tgs: 60 data_time: 0.83s time: 520.42s eta: 3 days, 12:25:26
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 32287.0 tgs: 61 data_time: 1.05s time: 524.53s eta: 3 days, 12:56:39
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.9GB text_tokens: 31446.0 tgs: 60 data_time: 0.75s time: 520.64s eta: 3 days, 12:10:14
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 32.9GB text_tokens: 31841.0 tgs: 61 data_time: 0.74s time: 519.96s eta: 3 days, 11:54:58
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.9GB text_tokens: 31057.0 tgs: 59 data_time: 0.73s time: 521.28s eta: 3 days, 11:59:02
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 32202.0 tgs: 61 data_time: 0.90s time: 524.12s eta: 3 days, 12:17:45
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 32492.0 tgs: 62 data_time: 0.65s time: 520.53s eta: 3 days, 11:34:28
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 31095.0 tgs: 59 data_time: 0.64s time: 518.82s eta: 3 days, 11:09:16
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.295 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.1GB text_tokens: 32615.0 tgs: 62 data_time: 0.69s time: 522.10s eta: 3 days, 11:32:08
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.310 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.0GB text_tokens: 32172.0 tgs: 61 data_time: 0.64s time: 523.82s eta: 3 days, 11:39:58
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.1GB text_tokens: 32340.0 tgs: 62 data_time: 0.61s time: 520.48s eta: 3 days, 10:59:17
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.342 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 32.9GB text_tokens: 31148.0 tgs: 60 data_time: 0.77s time: 518.43s eta: 3 days, 10:30:58
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 32.9GB text_tokens: 30975.0 tgs: 59 data_time: 0.64s time: 522.83s eta: 3 days, 11:04:20
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 32237.0 tgs: 61 data_time: 0.86s time: 523.53s eta: 3 days, 11:02:17
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 31482.0 tgs: 60 data_time: 0.87s time: 520.96s eta: 3 days, 10:29:06
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.0GB text_tokens: 31615.0 tgs: 60 data_time: 0.69s time: 519.10s eta: 3 days, 10:02:45
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.1GB text_tokens: 32372.0 tgs: 61 data_time: 0.95s time: 523.29s eta: 3 days, 10:33:47
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 27656.0 tgs: 52 data_time: 0.69s time: 522.96s eta: 3 days, 10:21:55
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 31754.0 tgs: 60 data_time: 0.56s time: 521.31s eta: 3 days, 9:57:42
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.339 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.1GB text_tokens: 32225.0 tgs: 61 data_time: 1.03s time: 520.63s eta: 3 days, 9:42:37
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 32178.0 tgs: 61 data_time: 0.88s time: 522.41s eta: 3 days, 9:50:41
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.0GB text_tokens: 32356.0 tgs: 61 data_time: 0.95s time: 523.75s eta: 3 days, 9:54:30
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.323 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 32084.0 tgs: 61 data_time: 0.55s time: 520.42s eta: 3 days, 9:14:34
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.9GB text_tokens: 31489.0 tgs: 60 data_time: 0.82s time: 520.67s eta: 3 days, 9:08:17
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.341 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.0GB text_tokens: 31760.0 tgs: 60 data_time: 0.76s time: 521.29s eta: 3 days, 9:05:25
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.0GB text_tokens: 31719.0 tgs: 60 data_time: 0.93s time: 524.10s eta: 3 days, 9:22:54
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 32.9GB text_tokens: 31694.0 tgs: 60 data_time: 0.92s time: 520.95s eta: 3 days, 8:44:51
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.1GB text_tokens: 31260.0 tgs: 60 data_time: 0.58s time: 520.19s eta: 3 days, 8:29:03
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.375 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.1GB text_tokens: 32424.0 tgs: 62 data_time: 1.24s time: 522.56s eta: 3 days, 8:42:22
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.9GB text_tokens: 31747.0 tgs: 60 data_time: 0.71s time: 524.33s eta: 3 days, 8:50:00
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.1GB text_tokens: 31806.0 tgs: 61 data_time: 0.89s time: 519.88s eta: 3 days, 8:00:12
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 32.9GB text_tokens: 31095.0 tgs: 59 data_time: 0.82s time: 520.11s eta: 3 days, 7:53:41
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 32.9GB text_tokens: 31720.0 tgs: 60 data_time: 0.98s time: 521.99s eta: 3 days, 8:02:19
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.314 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 31981.0 tgs: 60 data_time: 1.10s time: 524.35s eta: 3 days, 8:15:14
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.295 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.0GB text_tokens: 31115.0 tgs: 59 data_time: 0.87s time: 520.52s eta: 3 days, 7:31:27
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.1GB text_tokens: 32109.0 tgs: 61 data_time: 0.82s time: 518.56s eta: 3 days, 7:04:48
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.0GB text_tokens: 31354.0 tgs: 59 data_time: 0.76s time: 523.15s eta: 3 days, 7:38:05
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.0GB text_tokens: 31295.0 tgs: 59 data_time: 0.73s time: 523.62s eta: 3 days, 7:33:38
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.298 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 33.1GB text_tokens: 32080.0 tgs: 61 data_time: 0.79s time: 520.18s eta: 3 days, 6:53:40
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.322 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.0GB text_tokens: 32230.0 tgs: 61 data_time: 0.62s time: 520.05s eta: 3 days, 6:43:46
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 32.9GB text_tokens: 32091.0 tgs: 61 data_time: 0.84s time: 523.75s eta: 3 days, 7:08:41
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 32.7GB text_tokens: 31199.0 tgs: 59 data_time: 0.97s time: 523.79s eta: 3 days, 7:00:18
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 32.9GB text_tokens: 31340.0 tgs: 60 data_time: 0.78s time: 520.23s eta: 3 days, 6:19:26
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.298 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 31916.0 tgs: 61 data_time: 0.78s time: 518.77s eta: 3 days, 5:57:34
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.224 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.0GB text_tokens: 31919.0 tgs: 61 data_time: 0.64s time: 522.68s eta: 3 days, 6:24:07
|
| 364 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 32.5GB text_tokens: 31153.0 tgs: 59 data_time: 0.82s time: 522.74s eta: 3 days, 6:15:59
|
| 366 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.9GB text_tokens: 31392.0 tgs: 60 data_time: 0.92s time: 520.52s eta: 3 days, 5:47:21
|
| 368 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 32.8GB text_tokens: 31449.0 tgs: 60 data_time: 0.84s time: 519.65s eta: 3 days, 5:30:52
|
| 370 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 32236.0 tgs: 61 data_time: 0.78s time: 521.31s eta: 3 days, 5:36:59
|
| 372 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.7GB text_tokens: 31580.0 tgs: 60 data_time: 0.66s time: 523.58s eta: 3 days, 5:48:37
|
| 374 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 31801.0 tgs: 61 data_time: 0.64s time: 519.32s eta: 3 days, 5:01:54
|
| 376 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.231 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 31747.0 tgs: 60 data_time: 0.80s time: 520.66s eta: 3 days, 5:05:13
|
| 378 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 32.9GB text_tokens: 31667.0 tgs: 60 data_time: 1.02s time: 521.45s eta: 3 days, 5:03:30
|
| 380 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 31543.0 tgs: 60 data_time: 0.65s time: 523.94s eta: 3 days, 5:16:50
|
| 382 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.0GB text_tokens: 32204.0 tgs: 62 data_time: 0.67s time: 518.85s eta: 3 days, 4:23:08
|
| 384 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 32.7GB text_tokens: 30805.0 tgs: 59 data_time: 0.76s time: 519.27s eta: 3 days, 4:18:11
|
| 386 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.1GB text_tokens: 31921.0 tgs: 61 data_time: 0.95s time: 520.71s eta: 3 days, 4:22:14
|
| 388 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.337 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.1GB text_tokens: 32335.0 tgs: 61 data_time: 0.78s time: 524.25s eta: 3 days, 4:44:38
|
| 390 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 32297.0 tgs: 62 data_time: 0.86s time: 520.31s eta: 3 days, 4:01:24
|
| 392 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 31866.0 tgs: 61 data_time: 0.77s time: 519.49s eta: 3 days, 3:45:32
|
| 394 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 43][DP 10][SP 3][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 32561.0 tgs: 62 data_time: 0.62s time: 522.31s eta: 3 days, 4:01:31
|
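Every training step in rank43.log above was skipped because the reduced gradient norm was NaN/Inf, with a running skip counter. Below is a minimal sketch of that skip-and-count pattern as the warnings suggest it works; the helper name and counter are illustrative, not XTuner's actual API.

```python
# Minimal sketch of the NaN/Inf grad-norm skip pattern suggested by the
# warnings above. Names are illustrative, not XTuner's actual implementation.
import math
import torch

nan_skip_count = 0  # mirrors the "Skipped N steps in total" counter

def clip_and_maybe_skip(model, optimizer, max_grad_norm=1.0):
    """Clip gradients; skip the optimizer step if the norm is NaN or Inf."""
    global nan_skip_count
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if math.isnan(grad_norm.item()) or math.isinf(grad_norm.item()):
        nan_skip_count += 1
        optimizer.zero_grad()  # drop the bad gradients instead of applying them
        print(f"The grad norm is NaN or Inf, skip this step. "
              f"Skipped {nan_skip_count} steps in total.")
        return None
    optimizer.step()
    optimizer.zero_grad()
    return grad_norm
```

Note that `if_nan_skip` in the log lines equals the step index plus one here, i.e. every step so far has been skipped; the learning rate still advances with its warmup schedule.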
20250120235238/rank44.log
ADDED
|
@@ -0,0 +1,395 @@
|
| 1 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
|
| 2 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
|
| 6 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
|
| 7 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
|
| 8 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
|
| 9 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
|
| 10 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
|
| 11 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.15s
|
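The warnings above report, per JSONL shard, how many samples exceeded `max_length=32768` and were discarded. A hedged sketch of that length filter is below; the function, field names, and message format are assumptions based only on these log lines, not XTuner's actual loader.

```python
# Illustrative sketch of the per-file length filter implied by the warnings
# above: samples whose tokenized prompt exceeds max_length are dropped.
import json

def load_jsonl_filtered(path, tokenizer, max_length=32768):
    kept, discarded = [], 0
    with open(path) as f:
        for line in f:
            sample = json.loads(line)
            # "openai"-format messages; concatenating content is a rough proxy
            text = "".join(m["content"] for m in sample["messages"])
            if len(tokenizer(text)["input_ids"]) > max_length:
                discarded += 1
                continue
            kept.append(sample)
    if discarded:
        print(f"{path} has {discarded} prompt length>{max_length}, discard.")
    return kept
```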
| 12 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 77 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 78 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 79 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 80 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 81 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 82 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 83 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 84 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 85 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 86 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 189 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 190 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 191 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 192 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 193 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 194 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 195 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 196 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 197 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 198 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 199 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 200 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 201 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 202 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 203 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 204 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 205 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 206 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 207 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 208 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 209 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 210 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 211 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 212 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 213 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 214 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 215 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 216 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 217 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 218 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 219 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 220 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 221 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 222 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 245 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 246 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
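The `Dispatch ... forward to ...` lines above indicate that each matching submodule's `forward` is rebound to an optimized implementation, selected by module class. A minimal sketch of that dispatch mechanism follows; `rms_norm_forward` here is a plain-PyTorch stand-in, and the mapping approach is an assumption rather than XTuner's actual code.

```python
# Rough sketch of the "Dispatch ... forward to ..." pattern logged above:
# swap a module's forward for a replacement implementation, keyed by type.
import types
import torch

def rms_norm_forward(self, hidden_states):
    # Stand-in RMSNorm; assumes the module exposes `weight` and
    # `variance_epsilon`, as transformers' Qwen2RMSNorm does.
    variance = hidden_states.pow(2).mean(-1, keepdim=True)
    hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
    return self.weight * hidden_states

def dispatch_forwards(model, dispatch_map):
    """Rebind forward on every module whose class name is in dispatch_map."""
    for name, module in model.named_modules():
        fn = dispatch_map.get(type(module).__name__)
        if fn is not None:
            module.forward = types.MethodType(fn, module)
            print(f"Dispatch {name}({type(module).__name__}) forward "
                  f"to `{fn.__name__}`")
```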
| 254 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.80 seconds, peak gpu memory 13.4G
|
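The `[Parallelize LLM]` line above follows a run configured with `shard_strategy='full'`. A hedged sketch consistent with full parameter sharding is below, using PyTorch FSDP; this is an assumption about the setup, not XTuner's actual parallelization code.

```python
# Hedged sketch of full-shard parallelization consistent with
# shard_strategy='full' in the Namespace above (not XTuner's actual code).
# Assumes torch.distributed.init_process_group() has already been called.
import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import ShardingStrategy

def parallelize_llm(model):
    return FSDP(
        model,
        sharding_strategy=ShardingStrategy.FULL_SHARD,
        device_id=torch.cuda.current_device(),
    )
```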
| 255 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
|
| 256 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
|
| 257 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.359 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.0GB text_tokens: 31361.0 tgs: 57 data_time: 1.70s time: 547.71s eta: 3 days, 18:13:14
|
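The per-step `tgs` and `eta` fields can be reproduced from the other values on the same line. The arithmetic below is illustrative only; XTuner may smooth over a window rather than use the single-step time, which would explain small ETA differences.

```python
# How the throughput and ETA fields above can be derived
# (illustrative arithmetic; XTuner may average over recent steps).
import datetime

def step_metrics(text_tokens, step_time, step, total_steps):
    tgs = int(text_tokens / step_time)  # tokens per GPU per second
    eta = datetime.timedelta(seconds=int((total_steps - step) * step_time))
    return tgs, eta

# e.g. Step 1/593: 31361 tokens in 547.71 s -> tgs == 57, matching the log
print(step_metrics(31361.0, 547.71, 1, 593))
```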
| 258 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
|
| 259 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 32373.0 tgs: 61 data_time: 1.12s time: 523.24s eta: 3 days, 14:02:35
|
| 260 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
|
| 261 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.221 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 31271.0 tgs: 59 data_time: 1.12s time: 522.89s eta: 3 days, 13:50:26
|
| 262 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
|
| 263 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.0GB text_tokens: 31959.0 tgs: 61 data_time: 0.77s time: 520.29s eta: 3 days, 13:16:08
|
| 264 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
|
| 265 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.297 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.1GB text_tokens: 32119.0 tgs: 61 data_time: 0.60s time: 520.98s eta: 3 days, 13:14:16
|
| 266 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
|
| 267 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.8GB text_tokens: 31094.0 tgs: 59 data_time: 0.83s time: 520.91s eta: 3 days, 13:04:54
|
| 268 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
|
| 269 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 32.9GB text_tokens: 31860.0 tgs: 60 data_time: 1.02s time: 523.38s eta: 3 days, 13:20:23
|
| 270 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
|
| 271 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.380 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 32.6GB text_tokens: 31574.0 tgs: 60 data_time: 0.73s time: 520.66s eta: 3 days, 12:45:05
|
| 272 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
|
| 273 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.207 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 31714.0 tgs: 60 data_time: 0.74s time: 520.17s eta: 3 days, 12:31:40
|
| 274 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
|
| 275 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.9GB text_tokens: 31891.0 tgs: 61 data_time: 0.80s time: 520.42s eta: 3 days, 12:25:26
|
| 276 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
|
| 277 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.350 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.0GB text_tokens: 31327.0 tgs: 59 data_time: 0.87s time: 524.53s eta: 3 days, 12:56:39
|
| 278 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
|
| 279 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.225 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.9GB text_tokens: 32020.0 tgs: 61 data_time: 0.82s time: 520.64s eta: 3 days, 12:10:15
|
| 280 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
|
| 281 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.0GB text_tokens: 31061.0 tgs: 59 data_time: 0.69s time: 519.96s eta: 3 days, 11:54:58
|
| 282 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
|
| 283 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.315 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.8GB text_tokens: 30891.0 tgs: 59 data_time: 0.76s time: 521.28s eta: 3 days, 11:59:04
|
| 284 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
|
| 285 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.0GB text_tokens: 31341.0 tgs: 59 data_time: 0.80s time: 524.12s eta: 3 days, 12:17:45
|
| 286 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
|
| 287 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 32.9GB text_tokens: 31438.0 tgs: 60 data_time: 0.91s time: 520.53s eta: 3 days, 11:34:28
|
| 288 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
|
| 289 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.0GB text_tokens: 32067.0 tgs: 61 data_time: 0.67s time: 518.82s eta: 3 days, 11:09:16
|
| 290 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
|
| 291 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.1GB text_tokens: 32309.0 tgs: 61 data_time: 0.96s time: 522.10s eta: 3 days, 11:32:09
|
| 292 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
|
| 293 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.9GB text_tokens: 31293.0 tgs: 59 data_time: 0.78s time: 523.82s eta: 3 days, 11:39:59
|
| 294 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
|
| 295 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 32.9GB text_tokens: 31896.0 tgs: 61 data_time: 0.74s time: 520.49s eta: 3 days, 10:59:18
|
| 296 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
|
| 297 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 32.9GB text_tokens: 31834.0 tgs: 61 data_time: 0.68s time: 518.43s eta: 3 days, 10:30:58
|
| 298 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
|
| 299 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.295 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.0GB text_tokens: 32278.0 tgs: 61 data_time: 0.75s time: 522.84s eta: 3 days, 11:04:21
|
| 300 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
|
| 301 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.295 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 31172.0 tgs: 59 data_time: 0.95s time: 523.54s eta: 3 days, 11:02:18
|
| 302 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
|
| 303 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.324 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 32324.0 tgs: 62 data_time: 0.96s time: 520.96s eta: 3 days, 10:29:06
|
| 304 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
|
| 305 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.297 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.1GB text_tokens: 32419.0 tgs: 62 data_time: 0.90s time: 519.10s eta: 3 days, 10:02:45
|
| 306 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
|
| 307 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.224 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 32.7GB text_tokens: 31182.0 tgs: 59 data_time: 0.84s time: 523.29s eta: 3 days, 10:33:47
|
| 308 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
|
| 309 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 31901.0 tgs: 61 data_time: 0.70s time: 522.96s eta: 3 days, 10:21:57
|
| 310 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
|
| 311 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.0GB text_tokens: 30544.0 tgs: 58 data_time: 0.97s time: 521.31s eta: 3 days, 9:57:41
|
| 312 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
|
| 313 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.1GB text_tokens: 31830.0 tgs: 61 data_time: 0.76s time: 520.63s eta: 3 days, 9:42:37
|
| 314 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
|
| 315 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.329 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.1GB text_tokens: 31792.0 tgs: 60 data_time: 1.11s time: 522.41s eta: 3 days, 9:50:42
|
| 316 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
|
| 317 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.1GB text_tokens: 31968.0 tgs: 61 data_time: 0.81s time: 523.75s eta: 3 days, 9:54:30
|
| 318 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
|
| 319 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.0GB text_tokens: 31133.0 tgs: 59 data_time: 0.85s time: 520.42s eta: 3 days, 9:14:33
|
| 320 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
|
| 321 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 32219.0 tgs: 61 data_time: 0.83s time: 520.67s eta: 3 days, 9:08:17
|
| 322 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
|
| 323 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.0GB text_tokens: 31570.0 tgs: 60 data_time: 0.93s time: 521.30s eta: 3 days, 9:05:25
|
| 324 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
|
| 325 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.320 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.1GB text_tokens: 32328.0 tgs: 61 data_time: 0.99s time: 524.11s eta: 3 days, 9:22:55
|
| 326 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
|
| 327 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 31564.0 tgs: 60 data_time: 0.95s time: 520.95s eta: 3 days, 8:44:51
|
| 328 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
|
| 329 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.0GB text_tokens: 31631.0 tgs: 60 data_time: 0.99s time: 520.19s eta: 3 days, 8:29:04
|
| 330 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
|
| 331 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.1GB text_tokens: 31761.0 tgs: 60 data_time: 0.76s time: 522.56s eta: 3 days, 8:42:22
|
| 332 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
|
| 333 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 31545.0 tgs: 60 data_time: 0.86s time: 524.33s eta: 3 days, 8:50:00
|
| 334 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
|
| 335 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.215 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.0GB text_tokens: 30209.0 tgs: 58 data_time: 0.75s time: 519.88s eta: 3 days, 8:00:13
|
| 336 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
|
| 337 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.338 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 32351.0 tgs: 62 data_time: 0.63s time: 520.11s eta: 3 days, 7:53:42
|
| 338 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
|
| 339 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 32.8GB text_tokens: 31789.0 tgs: 60 data_time: 0.74s time: 521.99s eta: 3 days, 8:02:20
|
| 340 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.0GB text_tokens: 32049.0 tgs: 61 data_time: 0.72s time: 524.35s eta: 3 days, 8:15:14
|
| 342 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 32186.0 tgs: 61 data_time: 0.79s time: 520.52s eta: 3 days, 7:31:28
|
| 344 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.9GB text_tokens: 31935.0 tgs: 61 data_time: 0.85s time: 518.56s eta: 3 days, 7:04:49
|
| 346 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 32419.0 tgs: 61 data_time: 0.78s time: 523.15s eta: 3 days, 7:38:05
|
| 348 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.322 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 32.9GB text_tokens: 32027.0 tgs: 61 data_time: 0.86s time: 523.62s eta: 3 days, 7:33:39
|
| 350 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 33.1GB text_tokens: 31765.0 tgs: 61 data_time: 0.59s time: 520.19s eta: 3 days, 6:53:41
|
| 352 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 32.5GB text_tokens: 31151.0 tgs: 59 data_time: 0.82s time: 520.05s eta: 3 days, 6:43:46
|
| 354 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.297 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 32470.0 tgs: 61 data_time: 0.85s time: 523.75s eta: 3 days, 7:08:41
|
| 356 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
|
| 357 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.206 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.1GB text_tokens: 31763.0 tgs: 60 data_time: 0.89s time: 523.79s eta: 3 days, 7:00:18
|
| 358 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
|
| 359 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 32319.0 tgs: 62 data_time: 0.91s time: 520.23s eta: 3 days, 6:19:26
|
| 360 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
|
| 361 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.6GB text_tokens: 30805.0 tgs: 59 data_time: 0.85s time: 518.77s eta: 3 days, 5:57:34
|
| 362 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
|
| 363 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.0GB text_tokens: 32262.0 tgs: 61 data_time: 0.75s time: 522.68s eta: 3 days, 6:24:07
|
| 364 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 32600.0 tgs: 62 data_time: 0.83s time: 522.75s eta: 3 days, 6:16:00
|
| 366 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.8GB text_tokens: 31887.0 tgs: 61 data_time: 0.81s time: 520.52s eta: 3 days, 5:47:20
|
| 368 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.0GB text_tokens: 32253.0 tgs: 62 data_time: 0.95s time: 519.65s eta: 3 days, 5:30:53
|
| 370 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 32.9GB text_tokens: 29875.0 tgs: 57 data_time: 0.78s time: 521.31s eta: 3 days, 5:37:00
|
| 372 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.6GB text_tokens: 31345.0 tgs: 59 data_time: 0.77s time: 523.58s eta: 3 days, 5:48:37
|
| 374 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.0GB text_tokens: 30572.0 tgs: 58 data_time: 0.93s time: 519.32s eta: 3 days, 5:01:55
|
| 376 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 31052.0 tgs: 59 data_time: 0.88s time: 520.66s eta: 3 days, 5:05:13
|
| 378 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 31497.0 tgs: 60 data_time: 0.74s time: 521.45s eta: 3 days, 5:03:30
|
| 380 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 31868.0 tgs: 60 data_time: 0.82s time: 523.94s eta: 3 days, 5:16:52
|
| 382 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 31269.0 tgs: 60 data_time: 0.78s time: 518.85s eta: 3 days, 4:23:08
|
| 384 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.0GB text_tokens: 31590.0 tgs: 60 data_time: 0.70s time: 519.27s eta: 3 days, 4:18:11
|
| 386 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.1GB text_tokens: 32429.0 tgs: 62 data_time: 0.72s time: 520.71s eta: 3 days, 4:22:14
|
| 388 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.0GB text_tokens: 29122.0 tgs: 55 data_time: 0.86s time: 524.25s eta: 3 days, 4:44:38
|
| 390 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 32.9GB text_tokens: 31224.0 tgs: 60 data_time: 0.71s time: 520.32s eta: 3 days, 4:01:25
|
| 392 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.0GB text_tokens: 31914.0 tgs: 61 data_time: 1.10s time: 519.49s eta: 3 days, 3:45:32
|
| 394 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 44][DP 11][SP 0][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.215 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 31436.0 tgs: 60 data_time: 1.08s time: 522.31s eta: 3 days, 4:01:31
|
20250120235238/rank46.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
|
| 2 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
|
| 6 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
|
| 7 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
|
| 8 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
|
| 9 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
|
| 10 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
|
| 11 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.16s
|
| 12 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 77 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 78 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 79 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 80 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 81 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 82 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 83 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 84 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 85 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 86 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.80 seconds, peak gpu memory 13.4G
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.315 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.0GB text_tokens: 31361.0 tgs: 57 data_time: 1.69s time: 547.71s eta: 3 days, 18:13:12
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.236 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 32373.0 tgs: 61 data_time: 1.11s time: 523.23s eta: 3 days, 14:02:34
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 31271.0 tgs: 59 data_time: 1.16s time: 522.89s eta: 3 days, 13:50:25
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.0GB text_tokens: 31959.0 tgs: 61 data_time: 0.77s time: 520.28s eta: 3 days, 13:16:07
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.1GB text_tokens: 32119.0 tgs: 61 data_time: 0.59s time: 520.98s eta: 3 days, 13:14:15
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.225 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.8GB text_tokens: 31094.0 tgs: 59 data_time: 0.77s time: 520.91s eta: 3 days, 13:04:54
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 32.9GB text_tokens: 31860.0 tgs: 60 data_time: 1.00s time: 523.38s eta: 3 days, 13:20:24
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.344 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 32.6GB text_tokens: 31574.0 tgs: 60 data_time: 0.72s time: 520.66s eta: 3 days, 12:45:05
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 31714.0 tgs: 60 data_time: 0.76s time: 520.17s eta: 3 days, 12:31:41
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.9GB text_tokens: 31891.0 tgs: 61 data_time: 0.79s time: 520.42s eta: 3 days, 12:25:26
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.0GB text_tokens: 31327.0 tgs: 59 data_time: 0.86s time: 524.53s eta: 3 days, 12:56:38
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.210 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.9GB text_tokens: 32020.0 tgs: 61 data_time: 0.81s time: 520.64s eta: 3 days, 12:10:15
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.0GB text_tokens: 31061.0 tgs: 59 data_time: 0.67s time: 519.96s eta: 3 days, 11:54:58
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.8GB text_tokens: 30891.0 tgs: 59 data_time: 0.70s time: 521.28s eta: 3 days, 11:59:03
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.0GB text_tokens: 31341.0 tgs: 59 data_time: 0.77s time: 524.12s eta: 3 days, 12:17:44
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.310 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 32.9GB text_tokens: 31438.0 tgs: 60 data_time: 0.83s time: 520.53s eta: 3 days, 11:34:27
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.0GB text_tokens: 32067.0 tgs: 61 data_time: 0.65s time: 518.82s eta: 3 days, 11:09:18
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.1GB text_tokens: 32309.0 tgs: 61 data_time: 0.97s time: 522.10s eta: 3 days, 11:32:09
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.9GB text_tokens: 31293.0 tgs: 59 data_time: 0.76s time: 523.82s eta: 3 days, 11:39:59
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 32.9GB text_tokens: 31896.0 tgs: 61 data_time: 0.70s time: 520.48s eta: 3 days, 10:59:17
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 32.9GB text_tokens: 31834.0 tgs: 61 data_time: 0.67s time: 518.42s eta: 3 days, 10:30:57
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.0GB text_tokens: 32278.0 tgs: 61 data_time: 0.71s time: 522.83s eta: 3 days, 11:04:20
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 31172.0 tgs: 59 data_time: 0.89s time: 523.53s eta: 3 days, 11:02:18
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 32324.0 tgs: 62 data_time: 0.89s time: 520.96s eta: 3 days, 10:29:07
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.1GB text_tokens: 32419.0 tgs: 62 data_time: 0.85s time: 519.10s eta: 3 days, 10:02:45
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.351 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 32.7GB text_tokens: 31182.0 tgs: 59 data_time: 0.80s time: 523.29s eta: 3 days, 10:33:47
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 31901.0 tgs: 61 data_time: 0.67s time: 522.96s eta: 3 days, 10:21:57
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.316 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.0GB text_tokens: 30544.0 tgs: 58 data_time: 0.95s time: 521.31s eta: 3 days, 9:57:42
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.1GB text_tokens: 31830.0 tgs: 61 data_time: 0.73s time: 520.63s eta: 3 days, 9:42:37
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.317 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.1GB text_tokens: 31792.0 tgs: 60 data_time: 1.12s time: 522.42s eta: 3 days, 9:50:42
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.1GB text_tokens: 31968.0 tgs: 61 data_time: 0.81s time: 523.75s eta: 3 days, 9:54:29
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.0GB text_tokens: 31133.0 tgs: 59 data_time: 0.80s time: 520.42s eta: 3 days, 9:14:36
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 32219.0 tgs: 61 data_time: 0.87s time: 520.67s eta: 3 days, 9:08:16
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.0GB text_tokens: 31570.0 tgs: 60 data_time: 0.92s time: 521.30s eta: 3 days, 9:05:27
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.1GB text_tokens: 32328.0 tgs: 61 data_time: 1.02s time: 524.10s eta: 3 days, 9:22:54
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 31564.0 tgs: 60 data_time: 0.96s time: 520.95s eta: 3 days, 8:44:51
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.0GB text_tokens: 31631.0 tgs: 60 data_time: 0.93s time: 520.19s eta: 3 days, 8:29:04
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.1GB text_tokens: 31761.0 tgs: 60 data_time: 0.75s time: 522.56s eta: 3 days, 8:42:22
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 31545.0 tgs: 60 data_time: 0.87s time: 524.33s eta: 3 days, 8:50:01
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.0GB text_tokens: 30209.0 tgs: 58 data_time: 0.76s time: 519.88s eta: 3 days, 8:00:13
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.340 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 32351.0 tgs: 62 data_time: 0.65s time: 520.11s eta: 3 days, 7:53:41
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.354 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 32.8GB text_tokens: 31789.0 tgs: 60 data_time: 0.70s time: 521.99s eta: 3 days, 8:02:19
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.0GB text_tokens: 32049.0 tgs: 61 data_time: 0.73s time: 524.35s eta: 3 days, 8:15:16
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 32186.0 tgs: 61 data_time: 0.80s time: 520.52s eta: 3 days, 7:31:27
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.9GB text_tokens: 31935.0 tgs: 61 data_time: 0.81s time: 518.56s eta: 3 days, 7:04:49
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 32419.0 tgs: 61 data_time: 0.76s time: 523.15s eta: 3 days, 7:38:05
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 32.9GB text_tokens: 32027.0 tgs: 61 data_time: 0.84s time: 523.62s eta: 3 days, 7:33:38
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 33.1GB text_tokens: 31765.0 tgs: 61 data_time: 0.58s time: 520.19s eta: 3 days, 6:53:41
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 32.5GB text_tokens: 31151.0 tgs: 59 data_time: 0.79s time: 520.05s eta: 3 days, 6:43:46
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 32470.0 tgs: 61 data_time: 0.80s time: 523.75s eta: 3 days, 7:08:41
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.1GB text_tokens: 31763.0 tgs: 60 data_time: 0.86s time: 523.79s eta: 3 days, 7:00:17
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.320 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 32319.0 tgs: 62 data_time: 0.88s time: 520.23s eta: 3 days, 6:19:27
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.6GB text_tokens: 30805.0 tgs: 59 data_time: 0.83s time: 518.77s eta: 3 days, 5:57:35
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.0GB text_tokens: 32262.0 tgs: 61 data_time: 0.73s time: 522.68s eta: 3 days, 6:24:07
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 32600.0 tgs: 62 data_time: 0.80s time: 522.75s eta: 3 days, 6:16:00
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.329 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.8GB text_tokens: 31887.0 tgs: 61 data_time: 0.79s time: 520.52s eta: 3 days, 5:47:20
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.0GB text_tokens: 32253.0 tgs: 62 data_time: 0.94s time: 519.65s eta: 3 days, 5:30:53
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.229 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 32.9GB text_tokens: 29875.0 tgs: 57 data_time: 0.76s time: 521.31s eta: 3 days, 5:37:00
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.6GB text_tokens: 31345.0 tgs: 59 data_time: 0.76s time: 523.59s eta: 3 days, 5:48:38
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.0GB text_tokens: 30572.0 tgs: 58 data_time: 0.90s time: 519.32s eta: 3 days, 5:01:55
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 31052.0 tgs: 59 data_time: 0.86s time: 520.66s eta: 3 days, 5:05:13
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 31497.0 tgs: 60 data_time: 0.72s time: 521.45s eta: 3 days, 5:03:29
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 31868.0 tgs: 60 data_time: 0.78s time: 523.94s eta: 3 days, 5:16:52
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 31269.0 tgs: 60 data_time: 0.81s time: 518.85s eta: 3 days, 4:23:08
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.0GB text_tokens: 31590.0 tgs: 60 data_time: 0.73s time: 519.23s eta: 3 days, 4:17:51
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.1GB text_tokens: 32429.0 tgs: 62 data_time: 0.79s time: 520.71s eta: 3 days, 4:22:14
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.0GB text_tokens: 29122.0 tgs: 55 data_time: 0.86s time: 524.25s eta: 3 days, 4:44:38
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 32.9GB text_tokens: 31224.0 tgs: 60 data_time: 0.72s time: 520.32s eta: 3 days, 4:01:25
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.0GB text_tokens: 31914.0 tgs: 61 data_time: 1.07s time: 519.49s eta: 3 days, 3:45:32
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 46][DP 11][SP 2][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 31436.0 tgs: 60 data_time: 1.04s time: 522.31s eta: 3 days, 4:01:31
20250120235238/rank49.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.11s
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.38 seconds, peak gpu memory 13.4G
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 31357.0 tgs: 57 data_time: 1.95s time: 548.09s eta: 3 days, 18:16:54
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.222 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.0GB text_tokens: 32248.0 tgs: 61 data_time: 0.88s time: 523.23s eta: 3 days, 14:02:30
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.8GB text_tokens: 31853.0 tgs: 60 data_time: 1.03s time: 522.87s eta: 3 days, 13:50:18
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.330 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.0GB text_tokens: 32083.0 tgs: 61 data_time: 0.88s time: 520.30s eta: 3 days, 13:16:16
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 32.9GB text_tokens: 32125.0 tgs: 61 data_time: 0.98s time: 520.98s eta: 3 days, 13:14:15
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.9GB text_tokens: 31521.0 tgs: 60 data_time: 0.79s time: 520.89s eta: 3 days, 13:04:45
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 31512.0 tgs: 60 data_time: 0.89s time: 523.39s eta: 3 days, 13:20:27
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 31822.0 tgs: 61 data_time: 0.99s time: 520.66s eta: 3 days, 12:45:05
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.341 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 31908.0 tgs: 61 data_time: 0.80s time: 520.18s eta: 3 days, 12:31:47
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.352 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.1GB text_tokens: 32433.0 tgs: 62 data_time: 0.83s time: 520.39s eta: 3 days, 12:25:10
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 32333.0 tgs: 61 data_time: 0.77s time: 524.54s eta: 3 days, 12:56:44
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.201 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.0GB text_tokens: 30295.0 tgs: 58 data_time: 0.98s time: 520.65s eta: 3 days, 12:10:18
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.331 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.1GB text_tokens: 31362.0 tgs: 60 data_time: 0.81s time: 519.96s eta: 3 days, 11:54:56
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.1GB text_tokens: 32181.0 tgs: 61 data_time: 0.89s time: 521.29s eta: 3 days, 11:59:07
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 31770.0 tgs: 60 data_time: 1.07s time: 524.13s eta: 3 days, 12:17:48
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.326 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 32107.0 tgs: 61 data_time: 1.05s time: 520.54s eta: 3 days, 11:34:31
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.0GB text_tokens: 31655.0 tgs: 61 data_time: 0.69s time: 518.79s eta: 3 days, 11:09:03
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.306 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 32.9GB text_tokens: 31529.0 tgs: 60 data_time: 0.90s time: 522.11s eta: 3 days, 11:32:13
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.1GB text_tokens: 31277.0 tgs: 59 data_time: 0.77s time: 523.83s eta: 3 days, 11:40:03
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.1GB text_tokens: 31388.0 tgs: 60 data_time: 0.63s time: 520.47s eta: 3 days, 10:59:11
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.224 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 32144.0 tgs: 62 data_time: 0.73s time: 518.43s eta: 3 days, 10:31:02
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.307 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 32.8GB text_tokens: 31305.0 tgs: 59 data_time: 0.70s time: 522.84s eta: 3 days, 11:04:23
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 32167.0 tgs: 61 data_time: 0.89s time: 523.52s eta: 3 days, 11:02:11
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 32301.0 tgs: 62 data_time: 0.79s time: 520.96s eta: 3 days, 10:29:07
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.206 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.1GB text_tokens: 31599.0 tgs: 60 data_time: 0.70s time: 519.10s eta: 3 days, 10:02:48
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.1GB text_tokens: 32431.0 tgs: 61 data_time: 0.74s time: 523.30s eta: 3 days, 10:33:52
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.0GB text_tokens: 31812.0 tgs: 60 data_time: 0.85s time: 522.95s eta: 3 days, 10:21:49
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.0GB text_tokens: 30722.0 tgs: 58 data_time: 0.63s time: 521.31s eta: 3 days, 9:57:42
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.0GB text_tokens: 29912.0 tgs: 57 data_time: 0.86s time: 520.64s eta: 3 days, 9:42:40
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 31300.0 tgs: 59 data_time: 1.03s time: 522.40s eta: 3 days, 9:50:31
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.1GB text_tokens: 32379.0 tgs: 61 data_time: 1.06s time: 523.75s eta: 3 days, 9:54:31
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 32.8GB text_tokens: 31647.0 tgs: 60 data_time: 0.71s time: 520.43s eta: 3 days, 9:14:42
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.7GB text_tokens: 31134.0 tgs: 59 data_time: 0.81s time: 520.67s eta: 3 days, 9:08:17
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.343 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.1GB text_tokens: 31309.0 tgs: 60 data_time: 0.66s time: 521.29s eta: 3 days, 9:05:22
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.9GB text_tokens: 31520.0 tgs: 60 data_time: 0.62s time: 524.11s eta: 3 days, 9:22:57
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.329 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.1GB text_tokens: 32247.0 tgs: 61 data_time: 0.86s time: 520.96s eta: 3 days, 8:44:55
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.1GB text_tokens: 31693.0 tgs: 60 data_time: 0.92s time: 520.17s eta: 3 days, 8:28:54
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.213 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.0GB text_tokens: 32127.0 tgs: 61 data_time: 0.94s time: 522.56s eta: 3 days, 8:42:26
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.0GB text_tokens: 30901.0 tgs: 58 data_time: 0.66s time: 524.33s eta: 3 days, 8:50:02
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.0GB text_tokens: 31581.0 tgs: 60 data_time: 0.86s time: 519.89s eta: 3 days, 8:00:17
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.231 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 31785.0 tgs: 61 data_time: 0.79s time: 520.10s eta: 3 days, 7:53:33
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 32.8GB text_tokens: 31496.0 tgs: 60 data_time: 0.88s time: 522.00s eta: 3 days, 8:02:23
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 32186.0 tgs: 61 data_time: 0.73s time: 524.36s eta: 3 days, 8:15:20
|
| 342 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 31986.0 tgs: 61 data_time: 0.79s time: 520.51s eta: 3 days, 7:31:18
|
| 344 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.0GB text_tokens: 31430.0 tgs: 60 data_time: 0.84s time: 518.57s eta: 3 days, 7:04:53
|
| 346 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.0GB text_tokens: 31889.0 tgs: 60 data_time: 0.74s time: 523.15s eta: 3 days, 7:38:08
|
| 348 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 32.7GB text_tokens: 31343.0 tgs: 59 data_time: 0.94s time: 523.63s eta: 3 days, 7:33:45
|
| 350 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.340 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.9GB text_tokens: 31765.0 tgs: 61 data_time: 0.79s time: 520.16s eta: 3 days, 6:53:27
|
| 352 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.0GB text_tokens: 31720.0 tgs: 60 data_time: 1.02s time: 520.06s eta: 3 days, 6:43:50
|
| 354 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.210 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 31630.0 tgs: 60 data_time: 1.15s time: 523.75s eta: 3 days, 7:08:42
|
| 356 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
|
| 357 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.340 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.0GB text_tokens: 31547.0 tgs: 60 data_time: 0.87s time: 523.80s eta: 3 days, 7:00:21
|
| 358 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
|
| 359 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 31899.0 tgs: 61 data_time: 0.85s time: 520.24s eta: 3 days, 6:19:31
|
| 360 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
|
| 361 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.7GB text_tokens: 30536.0 tgs: 58 data_time: 0.85s time: 518.77s eta: 3 days, 5:57:36
|
| 362 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
|
| 363 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.1GB text_tokens: 31859.0 tgs: 60 data_time: 0.75s time: 522.66s eta: 3 days, 6:23:59
|
| 364 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 32.8GB text_tokens: 31761.0 tgs: 60 data_time: 0.80s time: 522.72s eta: 3 days, 6:15:46
|
| 366 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.359 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.0GB text_tokens: 31082.0 tgs: 59 data_time: 0.82s time: 520.53s eta: 3 days, 5:47:23
|
| 368 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 32.9GB text_tokens: 32012.0 tgs: 61 data_time: 0.78s time: 519.66s eta: 3 days, 5:30:56
|
| 370 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 32.8GB text_tokens: 31850.0 tgs: 61 data_time: 0.93s time: 521.30s eta: 3 days, 5:36:58
|
| 372 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.9GB text_tokens: 30425.0 tgs: 58 data_time: 0.98s time: 523.59s eta: 3 days, 5:48:42
|
| 374 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.217 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.1GB text_tokens: 31774.0 tgs: 61 data_time: 0.59s time: 519.32s eta: 3 days, 5:01:58
|
| 376 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 32.9GB text_tokens: 30693.0 tgs: 58 data_time: 0.85s time: 520.64s eta: 3 days, 5:05:02
|
| 378 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.1GB text_tokens: 31736.0 tgs: 60 data_time: 0.95s time: 521.45s eta: 3 days, 5:03:32
|
| 380 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.1GB text_tokens: 30661.0 tgs: 58 data_time: 0.68s time: 523.95s eta: 3 days, 5:16:55
|
| 382 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 32237.0 tgs: 62 data_time: 1.02s time: 518.85s eta: 3 days, 4:23:11
|
| 384 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.0GB text_tokens: 31226.0 tgs: 60 data_time: 1.11s time: 519.24s eta: 3 days, 4:17:59
|
| 386 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 32.9GB text_tokens: 31462.0 tgs: 60 data_time: 0.82s time: 520.72s eta: 3 days, 4:22:18
|
| 388 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 32.9GB text_tokens: 31003.0 tgs: 59 data_time: 0.89s time: 524.25s eta: 3 days, 4:44:41
|
| 390 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.0GB text_tokens: 30989.0 tgs: 59 data_time: 0.59s time: 520.30s eta: 3 days, 4:01:19
|
| 392 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 31722.0 tgs: 61 data_time: 0.71s time: 519.50s eta: 3 days, 3:45:36
|
| 394 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 49][DP 12][SP 1][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 32510.0 tgs: 62 data_time: 0.83s time: 522.32s eta: 3 days, 4:01:33
|
20250120235238/rank5.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
|
| 2 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-20 23:54:30][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
|
| 6 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
|
| 7 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
|
| 8 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
|
| 9 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
|
| 10 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
|
| 11 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.21s
|
| 12 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 77 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 78 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 79 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 80 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 81 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 82 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 83 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 84 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 85 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 86 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 189 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 190 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 191 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 192 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 193 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 194 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 195 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 196 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 197 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 198 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 199 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 200 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 201 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 202 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 203 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 204 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 205 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 206 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 207 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 208 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 209 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 210 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 211 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 212 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 213 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 214 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 215 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 216 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 217 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 218 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 219 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 220 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 221 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 222 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
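The DEBUG lines above record XTuner rebinding every submodule's forward to an optimized implementation before training begins. A minimal sketch of that dispatch pattern, assuming transformers' Qwen2RMSNorm attributes (`weight`, `variance_epsilon`) and an unfused reference body in place of the real fused kernel; this is illustrative, not XTuner's actual code:

```python
import types

import torch
from torch import nn


def rms_norm_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    # Unfused reference body; the real dispatch target would call a fused
    # RMSNorm kernel. Assumes `weight` and `variance_epsilon` attributes,
    # as on transformers' Qwen2RMSNorm.
    variance = hidden_states.float().pow(2).mean(-1, keepdim=True)
    normed = hidden_states.float() * torch.rsqrt(variance + self.variance_epsilon)
    return self.weight * normed.to(self.weight.dtype)


def dispatch_forwards(model: nn.Module) -> None:
    # Walk the module tree and rebind matching forwards; each rebinding
    # corresponds to one "Dispatch ...(Qwen2RMSNorm) forward to
    # `rms_norm_forward`" line in the log.
    for name, module in model.named_modules():
        if type(module).__name__ == "Qwen2RMSNorm":
            module.forward = types.MethodType(rms_norm_forward, module)
            print(f"Dispatch {name}(Qwen2RMSNorm) forward to `rms_norm_forward`")
```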
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 145.61 seconds, peak gpu memory 13.4G
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 32.6GB text_tokens: 31121.0 tgs: 56 data_time: 2.26s time: 546.73s eta: 3 days, 18:03:30
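Note the pattern in the step logs that follow: the local loss is finite, but loss(reduced) and grad_norm are NaN and if_nan_skip increments on every step, meaning at least one rank in the data-parallel group keeps producing non-finite gradients and every optimizer update is dropped. A minimal sketch of such a skip guard in a plain PyTorch loop (illustrative; XTuner's actual logic may differ):

```python
import torch


def clip_and_step(model, optimizer, max_grad_norm=1.0, skipped=0):
    # Compute the global grad norm; if it is NaN/Inf, drop the update
    # instead of corrupting the weights, mirroring the WARNING lines.
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if not torch.isfinite(grad_norm):
        skipped += 1
        print(f"The grad norm is NaN or Inf, skip this step. "
              f"Skipped {skipped} steps in total.")
    else:
        optimizer.step()
    optimizer.zero_grad()
    return grad_norm, skipped
```

Because the step is dropped before optimizer.step(), the weights are never updated, which is consistent with the loss hovering around 0.22-0.35 for the whole run shown here.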
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 32.9GB text_tokens: 31200.0 tgs: 59 data_time: 0.85s time: 523.22s eta: 3 days, 14:02:25
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.8GB text_tokens: 30861.0 tgs: 59 data_time: 0.93s time: 522.94s eta: 3 days, 13:50:56
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.1GB text_tokens: 31960.0 tgs: 61 data_time: 0.95s time: 520.28s eta: 3 days, 13:16:02
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 32.9GB text_tokens: 31620.0 tgs: 60 data_time: 1.15s time: 520.97s eta: 3 days, 13:14:09
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.222 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.1GB text_tokens: 31899.0 tgs: 61 data_time: 0.76s time: 520.91s eta: 3 days, 13:04:54
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 32173.0 tgs: 61 data_time: 1.01s time: 523.37s eta: 3 days, 13:20:16
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.0GB text_tokens: 32073.0 tgs: 61 data_time: 0.75s time: 520.65s eta: 3 days, 12:44:58
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.338 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 32230.0 tgs: 61 data_time: 0.81s time: 520.16s eta: 3 days, 12:31:35
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.224 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.1GB text_tokens: 32211.0 tgs: 61 data_time: 0.69s time: 520.48s eta: 3 days, 12:25:57
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.0GB text_tokens: 31612.0 tgs: 60 data_time: 0.90s time: 524.52s eta: 3 days, 12:56:33
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.1GB text_tokens: 31664.0 tgs: 60 data_time: 0.66s time: 520.63s eta: 3 days, 12:10:08
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.0GB text_tokens: 31279.0 tgs: 60 data_time: 0.67s time: 519.96s eta: 3 days, 11:54:56
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.8GB text_tokens: 31574.0 tgs: 60 data_time: 0.85s time: 521.28s eta: 3 days, 11:58:59
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 32.6GB text_tokens: 31608.0 tgs: 60 data_time: 0.71s time: 524.11s eta: 3 days, 12:17:36
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 32396.0 tgs: 62 data_time: 0.84s time: 520.52s eta: 3 days, 11:34:22
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 32445.0 tgs: 62 data_time: 0.77s time: 518.89s eta: 3 days, 11:09:59
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.1GB text_tokens: 32483.0 tgs: 62 data_time: 0.85s time: 522.09s eta: 3 days, 11:32:04
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.8GB text_tokens: 30947.0 tgs: 59 data_time: 0.88s time: 523.81s eta: 3 days, 11:39:52
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.325 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.0GB text_tokens: 32390.0 tgs: 62 data_time: 0.77s time: 520.47s eta: 3 days, 10:59:09
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 32.9GB text_tokens: 31866.0 tgs: 61 data_time: 0.86s time: 518.42s eta: 3 days, 10:30:52
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.308 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 32.9GB text_tokens: 31128.0 tgs: 59 data_time: 0.82s time: 522.82s eta: 3 days, 11:04:15
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 32297.0 tgs: 61 data_time: 0.67s time: 523.52s eta: 3 days, 11:02:10
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.313 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 32495.0 tgs: 62 data_time: 1.01s time: 521.01s eta: 3 days, 10:29:36
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.9GB text_tokens: 31533.0 tgs: 60 data_time: 0.71s time: 519.09s eta: 3 days, 10:02:39
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 32.4GB text_tokens: 31074.0 tgs: 59 data_time: 0.73s time: 523.28s eta: 3 days, 10:33:40
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.319 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 32081.0 tgs: 61 data_time: 0.84s time: 522.97s eta: 3 days, 10:22:02
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 31457.0 tgs: 60 data_time: 0.78s time: 521.30s eta: 3 days, 9:57:35
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.9GB text_tokens: 31099.0 tgs: 59 data_time: 0.71s time: 520.62s eta: 3 days, 9:42:30
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 32202.0 tgs: 61 data_time: 0.74s time: 522.42s eta: 3 days, 9:50:43
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.7GB text_tokens: 30996.0 tgs: 59 data_time: 0.69s time: 523.77s eta: 3 days, 9:54:43
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.298 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.0GB text_tokens: 31401.0 tgs: 60 data_time: 0.93s time: 520.41s eta: 3 days, 9:14:30
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.7GB text_tokens: 31506.0 tgs: 60 data_time: 0.86s time: 520.66s eta: 3 days, 9:08:11
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.0GB text_tokens: 31849.0 tgs: 61 data_time: 0.80s time: 521.32s eta: 3 days, 9:05:39
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.0GB text_tokens: 31503.0 tgs: 60 data_time: 0.63s time: 524.09s eta: 3 days, 9:22:48
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 32241.0 tgs: 61 data_time: 0.76s time: 520.94s eta: 3 days, 8:44:46
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.0GB text_tokens: 31502.0 tgs: 60 data_time: 0.83s time: 520.21s eta: 3 days, 8:29:18
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 32.8GB text_tokens: 29886.0 tgs: 57 data_time: 0.93s time: 522.55s eta: 3 days, 8:42:19
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.296 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 32309.0 tgs: 61 data_time: 0.74s time: 524.31s eta: 3 days, 8:49:54
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.297 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.8GB text_tokens: 31388.0 tgs: 60 data_time: 0.85s time: 519.87s eta: 3 days, 8:00:07
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.322 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 31748.0 tgs: 61 data_time: 0.79s time: 520.12s eta: 3 days, 7:53:46
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.354 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.1GB text_tokens: 31342.0 tgs: 60 data_time: 0.67s time: 522.00s eta: 3 days, 8:02:23
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.0GB text_tokens: 31840.0 tgs: 60 data_time: 0.84s time: 524.34s eta: 3 days, 8:15:09
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 31789.0 tgs: 61 data_time: 0.64s time: 520.56s eta: 3 days, 7:31:46
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.0GB text_tokens: 31941.0 tgs: 61 data_time: 0.78s time: 518.55s eta: 3 days, 7:04:43
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 32.2GB text_tokens: 30569.0 tgs: 58 data_time: 0.76s time: 523.14s eta: 3 days, 7:37:58
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.1GB text_tokens: 31307.0 tgs: 59 data_time: 0.82s time: 523.61s eta: 3 days, 7:33:33
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.6GB text_tokens: 31568.0 tgs: 60 data_time: 0.75s time: 520.20s eta: 3 days, 6:53:51
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 31707.0 tgs: 60 data_time: 1.06s time: 520.04s eta: 3 days, 6:43:40
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.223 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 31866.0 tgs: 60 data_time: 0.86s time: 523.74s eta: 3 days, 7:08:35
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.1GB text_tokens: 31584.0 tgs: 60 data_time: 0.88s time: 523.83s eta: 3 days, 7:00:37
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.306 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.0GB text_tokens: 31870.0 tgs: 61 data_time: 0.95s time: 520.22s eta: 3 days, 6:19:20
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.8GB text_tokens: 30551.0 tgs: 58 data_time: 0.59s time: 518.76s eta: 3 days, 5:57:29
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.1GB text_tokens: 31961.0 tgs: 61 data_time: 0.79s time: 522.67s eta: 3 days, 6:24:01
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 31763.0 tgs: 60 data_time: 0.98s time: 522.77s eta: 3 days, 6:16:11
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.1GB text_tokens: 31917.0 tgs: 61 data_time: 1.03s time: 520.51s eta: 3 days, 5:47:14
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:26:33][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 31804.0 tgs: 61 data_time: 0.69s time: 519.64s eta: 3 days, 5:30:47
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 32278.0 tgs: 61 data_time: 0.95s time: 521.34s eta: 3 days, 5:37:16
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.336 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.9GB text_tokens: 31927.0 tgs: 60 data_time: 0.79s time: 523.57s eta: 3 days, 5:48:31
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 31369.0 tgs: 60 data_time: 0.93s time: 519.29s eta: 3 days, 5:01:42
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.0GB text_tokens: 32214.0 tgs: 61 data_time: 0.80s time: 520.67s eta: 3 days, 5:05:14
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 31816.0 tgs: 61 data_time: 0.89s time: 521.47s eta: 3 days, 5:03:41
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.9GB text_tokens: 31958.0 tgs: 60 data_time: 0.60s time: 523.93s eta: 3 days, 5:16:46
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 31545.0 tgs: 60 data_time: 0.96s time: 518.84s eta: 3 days, 4:23:02
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.1GB text_tokens: 31565.0 tgs: 60 data_time: 0.74s time: 519.27s eta: 3 days, 4:18:14
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.0GB text_tokens: 31753.0 tgs: 60 data_time: 0.68s time: 520.70s eta: 3 days, 4:22:08
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.340 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 32.5GB text_tokens: 30827.0 tgs: 58 data_time: 0.72s time: 524.24s eta: 3 days, 4:44:33
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.312 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 32.8GB text_tokens: 31476.0 tgs: 60 data_time: 1.04s time: 520.34s eta: 3 days, 4:01:38
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 31389.0 tgs: 60 data_time: 0.43s time: 519.52s eta: 3 days, 3:45:47
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 5][DP 1][SP 1][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 32.4GB text_tokens: 30909.0 tgs: 59 data_time: 0.89s time: 522.30s eta: 3 days, 4:01:26
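The per-step fields are internally consistent: tgs is roughly text_tokens divided by the step time, and eta extrapolates the per-step wall time over the remaining steps (the logged value is presumably smoothed, so it differs slightly). A quick check against Step 70 above (field names from the log; the formulas are inferred, not taken from XTuner's source):

```python
from datetime import timedelta


def throughput_and_eta(text_tokens: float, step_time: float,
                       step: int, total_steps: int = 593):
    # tgs ~ tokens processed per GPU per second for the step just finished;
    # eta ~ remaining steps at the current per-step wall time.
    tgs = int(text_tokens / step_time)
    eta = timedelta(seconds=(total_steps - step) * step_time)
    return tgs, eta


# Step 70: 30909 tokens in 522.30 s -> tgs 59 (matches the log);
# eta -> ~3 days, 3:53 vs. the logged "3 days, 4:01:26".
print(throughput_and_eta(30909.0, 522.30, 70))
```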
20250120235238/rank54.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
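The argument dump pins down the run's parallel layout. With sp_size=4, consecutive groups of four ranks form one sequence-parallel group, so rank 54 sits in data-parallel replica 13, sequence-parallel slot 2, exactly the [DP 13][SP 2] tag on every line below. A back-of-envelope sketch (the 64-rank world size is an assumption, suggested by the rank files in this upload going up to rank 62):

```python
# Assumed world size: rank logs in this upload go up to rank 62.
world_size = 64
sp_size = 4                        # sp_size=4 in the Namespace above
dp_size = world_size // sp_size    # 16 data-parallel replicas

rank = 54
dp, sp = rank // sp_size, rank % sp_size   # -> DP 13, SP 2, as logged

micro_batch = 1                    # mirco_batch_size=1 (sic) in the args
global_batch = 64                  # global_batch_size=64
grad_accum = global_batch // (dp_size * micro_batch)   # -> 4 micro-steps per update
print(dp, sp, grad_accum)
```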
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.11s
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 189 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 190 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 191 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 192 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 193 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 194 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 195 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 196 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 197 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 198 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 199 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 200 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 201 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 202 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 203 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 204 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 205 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 206 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 207 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 208 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 209 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 210 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 211 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 212 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 213 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 214 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 215 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 216 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 217 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 218 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 219 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 220 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 221 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 222 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 245 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 246 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 254 |
+
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.36 seconds, peak gpu memory 13.4G
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 31804.0 tgs: 58 data_time: 1.90s time: 548.10s eta: 3 days, 18:17:03
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 31938.0 tgs: 61 data_time: 0.70s time: 523.22s eta: 3 days, 14:02:26
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 31420.0 tgs: 60 data_time: 0.77s time: 522.87s eta: 3 days, 13:50:17
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.315 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.1GB text_tokens: 32320.0 tgs: 62 data_time: 0.93s time: 520.29s eta: 3 days, 13:16:12
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 32.9GB text_tokens: 31755.0 tgs: 60 data_time: 0.87s time: 520.99s eta: 3 days, 13:14:20
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.0GB text_tokens: 31116.0 tgs: 59 data_time: 0.95s time: 520.89s eta: 3 days, 13:04:45
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 32297.0 tgs: 61 data_time: 0.83s time: 523.39s eta: 3 days, 13:20:27
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.0GB text_tokens: 32099.0 tgs: 61 data_time: 0.87s time: 520.66s eta: 3 days, 12:45:08
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 31943.0 tgs: 61 data_time: 0.95s time: 520.18s eta: 3 days, 12:31:44
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.1GB text_tokens: 32321.0 tgs: 62 data_time: 0.82s time: 520.40s eta: 3 days, 12:25:13
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.0GB text_tokens: 31685.0 tgs: 60 data_time: 1.21s time: 524.53s eta: 3 days, 12:56:43
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.0GB text_tokens: 31570.0 tgs: 60 data_time: 1.07s time: 520.65s eta: 3 days, 12:10:18
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.1GB text_tokens: 32271.0 tgs: 62 data_time: 1.02s time: 519.96s eta: 3 days, 11:54:56
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.9GB text_tokens: 31826.0 tgs: 61 data_time: 0.85s time: 521.29s eta: 3 days, 11:59:08
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.236 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.0GB text_tokens: 31152.0 tgs: 59 data_time: 0.74s time: 524.13s eta: 3 days, 12:17:49
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 31903.0 tgs: 61 data_time: 0.85s time: 520.54s eta: 3 days, 11:34:32
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 32.9GB text_tokens: 30941.0 tgs: 59 data_time: 0.67s time: 518.79s eta: 3 days, 11:09:03
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.185 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.1GB text_tokens: 29085.0 tgs: 55 data_time: 0.67s time: 522.11s eta: 3 days, 11:32:13
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.1GB text_tokens: 32477.0 tgs: 61 data_time: 1.11s time: 523.83s eta: 3 days, 11:40:03
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 32.9GB text_tokens: 31064.0 tgs: 59 data_time: 0.73s time: 520.47s eta: 3 days, 10:59:10
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.1GB text_tokens: 31960.0 tgs: 61 data_time: 0.99s time: 518.43s eta: 3 days, 10:31:02
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.310 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.0GB text_tokens: 31106.0 tgs: 59 data_time: 0.98s time: 522.84s eta: 3 days, 11:04:24
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.324 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 31337.0 tgs: 59 data_time: 1.02s time: 523.52s eta: 3 days, 11:02:11
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 32.8GB text_tokens: 31611.0 tgs: 60 data_time: 0.98s time: 520.96s eta: 3 days, 10:29:08
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.204 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.9GB text_tokens: 31573.0 tgs: 60 data_time: 0.69s time: 519.10s eta: 3 days, 10:02:48
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.1GB text_tokens: 31897.0 tgs: 60 data_time: 1.07s time: 523.29s eta: 3 days, 10:33:51
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.225 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 32058.0 tgs: 61 data_time: 0.94s time: 522.95s eta: 3 days, 10:21:50
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.218 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 31119.0 tgs: 59 data_time: 0.81s time: 521.32s eta: 3 days, 9:57:45
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.219 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.7GB text_tokens: 31356.0 tgs: 60 data_time: 0.57s time: 520.64s eta: 3 days, 9:42:40
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.1GB text_tokens: 32225.0 tgs: 61 data_time: 1.08s time: 522.39s eta: 3 days, 9:50:29
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.0GB text_tokens: 31625.0 tgs: 60 data_time: 0.76s time: 523.75s eta: 3 days, 9:54:33
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 31942.0 tgs: 61 data_time: 0.86s time: 520.43s eta: 3 days, 9:14:40
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.318 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.8GB text_tokens: 32061.0 tgs: 61 data_time: 0.89s time: 520.68s eta: 3 days, 9:08:20
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.337 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 32.4GB text_tokens: 30275.0 tgs: 58 data_time: 0.96s time: 521.29s eta: 3 days, 9:05:22
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.198 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.7GB text_tokens: 31108.0 tgs: 59 data_time: 0.77s time: 524.11s eta: 3 days, 9:22:58
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 32.9GB text_tokens: 31957.0 tgs: 61 data_time: 0.78s time: 520.96s eta: 3 days, 8:44:55
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 32.5GB text_tokens: 29801.0 tgs: 57 data_time: 0.97s time: 520.17s eta: 3 days, 8:28:53
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.229 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.0GB text_tokens: 31102.0 tgs: 59 data_time: 0.87s time: 522.56s eta: 3 days, 8:42:25
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 32173.0 tgs: 61 data_time: 0.80s time: 524.33s eta: 3 days, 8:50:04
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.0GB text_tokens: 31492.0 tgs: 60 data_time: 0.92s time: 519.89s eta: 3 days, 8:00:17
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.0GB text_tokens: 31539.0 tgs: 60 data_time: 0.90s time: 520.10s eta: 3 days, 7:53:33
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.1GB text_tokens: 32436.0 tgs: 62 data_time: 1.20s time: 522.00s eta: 3 days, 8:02:24
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.345 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 32092.0 tgs: 61 data_time: 0.72s time: 524.35s eta: 3 days, 8:15:18
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.0GB text_tokens: 31585.0 tgs: 60 data_time: 0.87s time: 520.51s eta: 3 days, 7:31:21
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.1GB text_tokens: 30729.0 tgs: 59 data_time: 0.47s time: 518.57s eta: 3 days, 7:04:52
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 31914.0 tgs: 61 data_time: 0.76s time: 523.15s eta: 3 days, 7:38:08
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 32.9GB text_tokens: 31429.0 tgs: 60 data_time: 0.96s time: 523.63s eta: 3 days, 7:33:43
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.7GB text_tokens: 31767.0 tgs: 61 data_time: 0.96s time: 520.16s eta: 3 days, 6:53:28
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 31842.0 tgs: 61 data_time: 0.87s time: 520.06s eta: 3 days, 6:43:50
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.385 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 31119.0 tgs: 59 data_time: 0.89s time: 523.76s eta: 3 days, 7:08:45
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.315 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 32.7GB text_tokens: 31541.0 tgs: 60 data_time: 1.25s time: 523.80s eta: 3 days, 7:00:21
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 32.8GB text_tokens: 31567.0 tgs: 60 data_time: 0.79s time: 520.24s eta: 3 days, 6:19:30
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.0GB text_tokens: 32276.0 tgs: 62 data_time: 1.00s time: 518.78s eta: 3 days, 5:57:38
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.1GB text_tokens: 31104.0 tgs: 59 data_time: 0.71s time: 522.67s eta: 3 days, 6:23:59
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.0GB text_tokens: 32112.0 tgs: 61 data_time: 0.78s time: 522.72s eta: 3 days, 6:15:46
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.209 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 32.7GB text_tokens: 31195.0 tgs: 59 data_time: 0.77s time: 520.53s eta: 3 days, 5:47:23
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.310 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 31401.0 tgs: 60 data_time: 0.66s time: 519.66s eta: 3 days, 5:30:57
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 32.9GB text_tokens: 31338.0 tgs: 60 data_time: 0.84s time: 521.31s eta: 3 days, 5:37:00
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 33.0GB text_tokens: 31585.0 tgs: 60 data_time: 0.80s time: 523.59s eta: 3 days, 5:48:41
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.6GB text_tokens: 31441.0 tgs: 60 data_time: 0.79s time: 519.32s eta: 3 days, 5:01:59
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.0GB text_tokens: 32230.0 tgs: 61 data_time: 1.05s time: 520.64s eta: 3 days, 5:05:03
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 32.9GB text_tokens: 32203.0 tgs: 61 data_time: 0.85s time: 521.45s eta: 3 days, 5:03:32
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.1GB text_tokens: 31194.0 tgs: 59 data_time: 0.83s time: 523.95s eta: 3 days, 5:16:56
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 31045.0 tgs: 59 data_time: 0.76s time: 518.85s eta: 3 days, 4:23:12
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.1GB text_tokens: 32341.0 tgs: 62 data_time: 0.82s time: 519.25s eta: 3 days, 4:18:00
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.1GB text_tokens: 32008.0 tgs: 61 data_time: 0.78s time: 520.72s eta: 3 days, 4:22:18
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.0GB text_tokens: 31320.0 tgs: 59 data_time: 1.24s time: 524.25s eta: 3 days, 4:44:41
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.215 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 32.8GB text_tokens: 31671.0 tgs: 60 data_time: 0.43s time: 520.30s eta: 3 days, 4:01:18
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 31891.0 tgs: 61 data_time: 0.75s time: 519.50s eta: 3 days, 3:45:36
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 54][DP 13][SP 2][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.373 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.0GB text_tokens: 31509.0 tgs: 60 data_time: 0.93s time: 522.32s eta: 3 days, 4:01:35
20250120235238/rank56.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
|
| 2 |
+
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.36s
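The eight WARNING lines above record the loader discarding samples whose tokenized prompt exceeds the run's max_length of 32768. A minimal sketch of such a filtering pass in Python, assuming Hugging-Face-style tokenization of openai-format JSONL records; the helper name and the message-joining logic are illustrative assumptions, not XTuner's actual implementation:

import json

MAX_LENGTH = 32768  # matches max_length in the Namespace above

def iter_filtered(path, tokenizer):
    # Yield JSONL samples, discarding prompts longer than MAX_LENGTH tokens.
    discarded = 0
    with open(path) as f:
        for line in f:
            sample = json.loads(line)
            # Illustrative length check: join message contents, then tokenize.
            text = "".join(m["content"] for m in sample["messages"])
            if len(tokenizer(text)["input_ids"]) > MAX_LENGTH:
                discarded += 1
                continue
            yield sample
    if discarded:
        print(f"{path} has {discarded} prompt length>{MAX_LENGTH}, discard.")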
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
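The DEBUG block above shows each matching submodule's forward being rebound to an accelerated implementation: FlashAttention for self-attention and a fused kernel for RMSNorm. A common mechanism for this kind of dispatch is monkey-patching the bound method; the sketch below is an assumption about that mechanism, not XTuner's code. It uses a reference RMSNorm body where a fused kernel would go, and relies on Qwen2RMSNorm exposing weight and variance_epsilon, as the Hugging Face implementation does:

import types

import torch
from torch import nn

def rms_norm_forward(self, hidden_states):
    # Reference RMSNorm with the stock signature; a real dispatch would
    # swap in a fused kernel instead.
    input_dtype = hidden_states.dtype
    hidden_states = hidden_states.to(torch.float32)
    variance = hidden_states.pow(2).mean(-1, keepdim=True)
    hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
    return self.weight * hidden_states.to(input_dtype)

def dispatch_forward(model: nn.Module, cls_name: str, new_forward):
    # Rebind `forward` on every submodule whose class name matches cls_name.
    for name, module in model.named_modules():
        if type(module).__name__ == cls_name:
            module.forward = types.MethodType(new_forward, module)
            print(f"Dispatch {name}({cls_name}) forward to `{new_forward.__name__}`")

# Usage (hypothetical): dispatch_forward(model, "Qwen2RMSNorm", rms_norm_forward)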
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 142.52 seconds, peak gpu memory 13.4G
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 32438.0 tgs: 59 data_time: 2.38s time: 546.75s eta: 3 days, 18:03:41
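From step 0 onward every update trips the same guard: the clipped global gradient norm comes back non-finite, so the optimizer step is skipped and the skip counter (if_nan_skip) grows. Note that the per-rank loss is often finite while loss(reduced) is nan, which suggests the non-finite values originate on other ranks. A minimal sketch of such a guard in a standard PyTorch loop; all names are illustrative:

import torch

nan_skip_total = 0

def guarded_step(model, optimizer, max_grad_norm=1.0):
    # Clip gradients, then skip the update when the norm is NaN or Inf.
    global nan_skip_total
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if torch.isfinite(grad_norm):
        optimizer.step()
    else:
        nan_skip_total += 1
        print(f"The grad norm is NaN or Inf, skip this step. "
              f"Skipped {nan_skip_total} steps in total.")
    optimizer.zero_grad()
    return grad_norm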
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 31049.0 tgs: 59 data_time: 0.93s time: 523.24s eta: 3 days, 14:02:40
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.9GB text_tokens: 31400.0 tgs: 60 data_time: 1.00s time: 522.87s eta: 3 days, 13:50:15
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.0GB text_tokens: 31528.0 tgs: 60 data_time: 0.78s time: 520.30s eta: 3 days, 13:16:16
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 32.8GB text_tokens: 31808.0 tgs: 61 data_time: 0.96s time: 520.98s eta: 3 days, 13:14:19
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.1GB text_tokens: 32312.0 tgs: 62 data_time: 0.74s time: 520.91s eta: 3 days, 13:04:57
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 31349.0 tgs: 59 data_time: 0.73s time: 523.38s eta: 3 days, 13:20:24
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.0GB text_tokens: 31993.0 tgs: 61 data_time: 0.72s time: 520.66s eta: 3 days, 12:45:08
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.0GB text_tokens: 32365.0 tgs: 62 data_time: 0.68s time: 520.18s eta: 3 days, 12:31:45
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.397 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.7GB text_tokens: 31497.0 tgs: 60 data_time: 0.83s time: 520.37s eta: 3 days, 12:24:56
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 31507.0 tgs: 60 data_time: 0.87s time: 524.53s eta: 3 days, 12:56:43
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.4GB text_tokens: 29899.0 tgs: 57 data_time: 0.78s time: 520.66s eta: 3 days, 12:10:23
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 32.9GB text_tokens: 32129.0 tgs: 61 data_time: 0.86s time: 519.96s eta: 3 days, 11:54:59
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.7GB text_tokens: 31562.0 tgs: 60 data_time: 0.89s time: 521.29s eta: 3 days, 11:59:08
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 30596.0 tgs: 58 data_time: 0.82s time: 524.13s eta: 3 days, 12:17:50
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 31446.0 tgs: 60 data_time: 0.91s time: 520.54s eta: 3 days, 11:34:33
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.311 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 32.5GB text_tokens: 30935.0 tgs: 59 data_time: 0.62s time: 518.78s eta: 3 days, 11:08:54
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31582.0 tgs: 60 data_time: 0.72s time: 522.11s eta: 3 days, 11:32:13
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.0GB text_tokens: 32037.0 tgs: 61 data_time: 0.78s time: 523.83s eta: 3 days, 11:40:03
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.0GB text_tokens: 32131.0 tgs: 61 data_time: 0.70s time: 520.49s eta: 3 days, 10:59:20
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.1GB text_tokens: 32342.0 tgs: 62 data_time: 0.74s time: 518.43s eta: 3 days, 10:31:02
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.0GB text_tokens: 31716.0 tgs: 60 data_time: 0.66s time: 522.84s eta: 3 days, 11:04:25
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 31956.0 tgs: 61 data_time: 0.90s time: 523.50s eta: 3 days, 11:01:57
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.0GB text_tokens: 32071.0 tgs: 61 data_time: 1.02s time: 520.96s eta: 3 days, 10:29:06
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.1GB text_tokens: 31325.0 tgs: 60 data_time: 0.71s time: 519.10s eta: 3 days, 10:02:48
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 32.6GB text_tokens: 30772.0 tgs: 58 data_time: 0.95s time: 523.30s eta: 3 days, 10:33:52
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.348 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 32.9GB text_tokens: 31620.0 tgs: 60 data_time: 1.02s time: 522.95s eta: 3 days, 10:21:51
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.311 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 32.9GB text_tokens: 31970.0 tgs: 61 data_time: 1.04s time: 521.32s eta: 3 days, 9:57:45
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.225 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.1GB text_tokens: 32184.0 tgs: 61 data_time: 0.62s time: 520.64s eta: 3 days, 9:42:40
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 31024.0 tgs: 59 data_time: 0.91s time: 522.41s eta: 3 days, 9:50:38
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.4GB text_tokens: 30697.0 tgs: 58 data_time: 0.94s time: 523.75s eta: 3 days, 9:54:33
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 32.9GB text_tokens: 32236.0 tgs: 61 data_time: 0.76s time: 520.43s eta: 3 days, 9:14:40
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.212 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 31376.0 tgs: 60 data_time: 0.71s time: 520.68s eta: 3 days, 9:08:21
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 32.9GB text_tokens: 31253.0 tgs: 59 data_time: 0.67s time: 521.26s eta: 3 days, 9:05:06
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 31.7GB text_tokens: 27839.0 tgs: 53 data_time: 0.68s time: 524.11s eta: 3 days, 9:22:57
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.237 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.1GB text_tokens: 31971.0 tgs: 61 data_time: 0.84s time: 520.96s eta: 3 days, 8:44:56
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.324 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.1GB text_tokens: 32205.0 tgs: 61 data_time: 0.74s time: 520.17s eta: 3 days, 8:28:55
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.193 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 32.5GB text_tokens: 30720.0 tgs: 58 data_time: 0.76s time: 522.56s eta: 3 days, 8:42:26
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.8GB text_tokens: 32017.0 tgs: 61 data_time: 0.89s time: 524.33s eta: 3 days, 8:50:04
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.8GB text_tokens: 31169.0 tgs: 59 data_time: 0.62s time: 519.89s eta: 3 days, 8:00:17
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 32306.0 tgs: 62 data_time: 0.62s time: 520.09s eta: 3 days, 7:53:28
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.0GB text_tokens: 30552.0 tgs: 58 data_time: 0.75s time: 522.00s eta: 3 days, 8:02:24
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.198 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 31.9GB text_tokens: 27829.0 tgs: 53 data_time: 0.64s time: 524.36s eta: 3 days, 8:15:19
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 32389.0 tgs: 62 data_time: 0.62s time: 520.50s eta: 3 days, 7:31:17
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.0GB text_tokens: 31067.0 tgs: 59 data_time: 0.67s time: 518.57s eta: 3 days, 7:04:52
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 32547.0 tgs: 62 data_time: 0.82s time: 523.16s eta: 3 days, 7:38:09
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.1GB text_tokens: 32221.0 tgs: 61 data_time: 0.73s time: 523.63s eta: 3 days, 7:33:43
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.9GB text_tokens: 31075.0 tgs: 59 data_time: 0.56s time: 520.16s eta: 3 days, 6:53:28
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 32.9GB text_tokens: 30517.0 tgs: 58 data_time: 0.74s time: 520.06s eta: 3 days, 6:43:51
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 32.7GB text_tokens: 31792.0 tgs: 60 data_time: 0.83s time: 523.76s eta: 3 days, 7:08:45
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.1GB text_tokens: 31413.0 tgs: 59 data_time: 1.05s time: 523.78s eta: 3 days, 7:00:13
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.0GB text_tokens: 31691.0 tgs: 60 data_time: 0.82s time: 520.24s eta: 3 days, 6:19:30
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 32130.0 tgs: 61 data_time: 0.65s time: 518.78s eta: 3 days, 5:57:38
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.223 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 32.7GB text_tokens: 31532.0 tgs: 60 data_time: 0.58s time: 522.67s eta: 3 days, 6:24:01
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 32.4GB text_tokens: 31076.0 tgs: 59 data_time: 0.82s time: 522.72s eta: 3 days, 6:15:47
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.1GB text_tokens: 31660.0 tgs: 60 data_time: 0.70s time: 520.53s eta: 3 days, 5:47:24
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 32304.0 tgs: 62 data_time: 0.67s time: 519.66s eta: 3 days, 5:30:57
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 32.9GB text_tokens: 31855.0 tgs: 61 data_time: 0.87s time: 521.30s eta: 3 days, 5:36:56
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.9GB text_tokens: 31620.0 tgs: 60 data_time: 0.62s time: 523.59s eta: 3 days, 5:48:41
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.0GB text_tokens: 32321.0 tgs: 62 data_time: 0.88s time: 519.32s eta: 3 days, 5:01:59
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 32465.0 tgs: 62 data_time: 0.75s time: 520.64s eta: 3 days, 5:05:00
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 32.8GB text_tokens: 31067.0 tgs: 59 data_time: 0.73s time: 521.45s eta: 3 days, 5:03:32
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.9GB text_tokens: 31472.0 tgs: 60 data_time: 0.82s time: 523.95s eta: 3 days, 5:16:55
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 30640.0 tgs: 59 data_time: 1.08s time: 518.85s eta: 3 days, 4:23:12
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.372 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.1GB text_tokens: 32323.0 tgs: 62 data_time: 0.79s time: 519.23s eta: 3 days, 4:17:55
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.296 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.0GB text_tokens: 32138.0 tgs: 61 data_time: 0.81s time: 520.72s eta: 3 days, 4:22:17
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.1GB text_tokens: 31246.0 tgs: 59 data_time: 0.73s time: 524.26s eta: 3 days, 4:44:42
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.216 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 30657.0 tgs: 58 data_time: 0.62s time: 520.32s eta: 3 days, 4:01:29
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 32.7GB text_tokens: 31712.0 tgs: 61 data_time: 0.93s time: 519.50s eta: 3 days, 3:45:37
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 56][DP 14][SP 0][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 31495.0 tgs: 60 data_time: 0.73s time: 522.32s eta: 3 days, 4:01:33
20250120235238/rank59.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.12s
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 245 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 246 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
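The repeated `Dispatch ... forward to ...` DEBUG lines above record XTuner swapping each matching submodule's `forward` for an optimized implementation (a fused RMSNorm, a flash-attention path) before training starts. Below is a minimal sketch of that pattern, assuming a standard PyTorch module tree; `dispatch_forward` and the reference `rms_norm_forward` body are illustrative stand-ins, not XTuner's actual code:

```python
import types

import torch
from torch import nn

def rms_norm_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    # Stand-in for a fused RMSNorm kernel; this is just the reference math
    # using the `weight` and `variance_epsilon` attributes of Qwen2RMSNorm.
    variance = hidden_states.float().pow(2).mean(-1, keepdim=True)
    normed = hidden_states.float() * torch.rsqrt(variance + self.variance_epsilon)
    return (self.weight * normed).to(self.weight.dtype)

def dispatch_forward(model: nn.Module, cls_name: str, new_forward) -> None:
    # Bind `new_forward` as the forward method of every submodule whose class
    # name matches, logging one line per module as in the DEBUG output above.
    for name, module in model.named_modules():
        if type(module).__name__ == cls_name:
            module.forward = types.MethodType(new_forward, module)
            print(f"Dispatch {name}({cls_name}) forward to `{new_forward.__name__}`")

# Usage (hypothetical): dispatch_forward(model, "Qwen2RMSNorm", rms_norm_forward)
```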
| 254 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 142.74 seconds, peak gpu memory 13.4G
|
| 255 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
|
| 256 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
|
| 257 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 32438.0 tgs: 59 data_time: 2.40s time: 546.74s eta: 3 days, 18:03:35
|
| 258 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
|
| 259 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 31049.0 tgs: 59 data_time: 0.93s time: 523.25s eta: 3 days, 14:02:41
|
| 260 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
|
| 261 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.9GB text_tokens: 31400.0 tgs: 60 data_time: 1.01s time: 522.87s eta: 3 days, 13:50:15
|
| 262 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
|
| 263 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.0GB text_tokens: 31528.0 tgs: 60 data_time: 0.74s time: 520.29s eta: 3 days, 13:16:13
|
| 264 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
|
| 265 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 32.8GB text_tokens: 31808.0 tgs: 61 data_time: 0.96s time: 520.99s eta: 3 days, 13:14:20
|
| 266 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
|
| 267 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.1GB text_tokens: 32312.0 tgs: 62 data_time: 0.74s time: 520.91s eta: 3 days, 13:04:56
|
| 268 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
|
| 269 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 31349.0 tgs: 59 data_time: 0.73s time: 523.39s eta: 3 days, 13:20:28
|
| 270 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
|
| 271 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.0GB text_tokens: 31993.0 tgs: 61 data_time: 0.71s time: 520.67s eta: 3 days, 12:45:09
|
| 272 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
|
| 273 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.0GB text_tokens: 32365.0 tgs: 62 data_time: 0.70s time: 520.18s eta: 3 days, 12:31:45
|
| 274 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
|
| 275 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.7GB text_tokens: 31497.0 tgs: 60 data_time: 0.85s time: 520.37s eta: 3 days, 12:24:56
|
| 276 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
|
| 277 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.1GB text_tokens: 31507.0 tgs: 60 data_time: 0.86s time: 524.53s eta: 3 days, 12:56:43
|
| 278 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
|
| 279 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.4GB text_tokens: 29899.0 tgs: 57 data_time: 0.77s time: 520.65s eta: 3 days, 12:10:19
|
| 280 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
|
| 281 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 32.9GB text_tokens: 32129.0 tgs: 61 data_time: 0.86s time: 519.97s eta: 3 days, 11:55:02
|
| 282 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
|
| 283 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.7GB text_tokens: 31562.0 tgs: 60 data_time: 0.88s time: 521.29s eta: 3 days, 11:59:09
|
| 284 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
|
| 285 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 30596.0 tgs: 58 data_time: 0.83s time: 524.13s eta: 3 days, 12:17:50
|
| 286 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
|
| 287 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 31446.0 tgs: 60 data_time: 0.93s time: 520.54s eta: 3 days, 11:34:31
|
| 288 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
|
| 289 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 32.5GB text_tokens: 30935.0 tgs: 59 data_time: 0.63s time: 518.78s eta: 3 days, 11:08:55
|
| 290 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
|
| 291 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31582.0 tgs: 60 data_time: 0.73s time: 522.11s eta: 3 days, 11:32:14
|
| 292 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
|
| 293 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.0GB text_tokens: 32037.0 tgs: 61 data_time: 0.79s time: 523.83s eta: 3 days, 11:40:04
|
| 294 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
|
| 295 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.0GB text_tokens: 32131.0 tgs: 61 data_time: 0.71s time: 520.49s eta: 3 days, 10:59:20
|
| 296 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
|
| 297 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.1GB text_tokens: 32342.0 tgs: 62 data_time: 0.75s time: 518.43s eta: 3 days, 10:31:02
|
| 298 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
|
| 299 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.0GB text_tokens: 31716.0 tgs: 60 data_time: 0.66s time: 522.84s eta: 3 days, 11:04:26
|
| 300 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
|
| 301 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 31956.0 tgs: 61 data_time: 0.87s time: 523.50s eta: 3 days, 11:01:57
|
| 302 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
|
| 303 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.0GB text_tokens: 32071.0 tgs: 61 data_time: 1.01s time: 520.96s eta: 3 days, 10:29:08
|
| 304 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
|
| 305 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 33.1GB text_tokens: 31325.0 tgs: 60 data_time: 0.68s time: 519.10s eta: 3 days, 10:02:49
|
| 306 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
|
| 307 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 32.6GB text_tokens: 30772.0 tgs: 58 data_time: 0.94s time: 523.30s eta: 3 days, 10:33:52
|
| 308 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
|
| 309 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 32.9GB text_tokens: 31620.0 tgs: 60 data_time: 1.01s time: 522.95s eta: 3 days, 10:21:51
|
| 310 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
|
| 311 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 32.9GB text_tokens: 31970.0 tgs: 61 data_time: 1.04s time: 521.32s eta: 3 days, 9:57:46
|
| 312 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
|
| 313 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 33.1GB text_tokens: 32184.0 tgs: 61 data_time: 0.62s time: 520.64s eta: 3 days, 9:42:41
|
| 314 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
|
| 315 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 31024.0 tgs: 59 data_time: 0.91s time: 522.41s eta: 3 days, 9:50:39
|
| 316 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
|
| 317 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.4GB text_tokens: 30697.0 tgs: 58 data_time: 0.95s time: 523.76s eta: 3 days, 9:54:34
|
| 318 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
|
| 319 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 32.9GB text_tokens: 32236.0 tgs: 61 data_time: 0.77s time: 520.43s eta: 3 days, 9:14:40
|
| 320 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
|
| 321 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 31376.0 tgs: 60 data_time: 0.72s time: 520.68s eta: 3 days, 9:08:20
|
| 322 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
|
| 323 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 32.9GB text_tokens: 31253.0 tgs: 59 data_time: 0.67s time: 521.26s eta: 3 days, 9:05:07
|
| 324 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
|
| 325 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 31.7GB text_tokens: 27839.0 tgs: 53 data_time: 0.68s time: 524.11s eta: 3 days, 9:22:58
|
| 326 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
|
| 327 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.1GB text_tokens: 31971.0 tgs: 61 data_time: 0.85s time: 520.96s eta: 3 days, 8:44:56
|
| 328 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
|
| 329 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.1GB text_tokens: 32205.0 tgs: 61 data_time: 0.74s time: 520.17s eta: 3 days, 8:28:56
|
| 330 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
|
| 331 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 32.5GB text_tokens: 30720.0 tgs: 58 data_time: 0.78s time: 522.57s eta: 3 days, 8:42:26
|
| 332 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
|
| 333 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.8GB text_tokens: 32017.0 tgs: 61 data_time: 0.89s time: 524.33s eta: 3 days, 8:50:05
|
| 334 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
|
| 335 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.8GB text_tokens: 31169.0 tgs: 59 data_time: 0.63s time: 519.89s eta: 3 days, 8:00:17
|
| 336 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
|
| 337 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 32306.0 tgs: 62 data_time: 0.62s time: 520.09s eta: 3 days, 7:53:29
|
| 338 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
|
| 339 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.0GB text_tokens: 30552.0 tgs: 58 data_time: 0.74s time: 522.00s eta: 3 days, 8:02:24
|
| 340 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 31.9GB text_tokens: 27829.0 tgs: 53 data_time: 0.64s time: 524.35s eta: 3 days, 8:15:19
|
| 342 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 32389.0 tgs: 62 data_time: 0.58s time: 520.51s eta: 3 days, 7:31:19
|
| 344 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.0GB text_tokens: 31067.0 tgs: 59 data_time: 0.69s time: 518.57s eta: 3 days, 7:04:53
|
| 346 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 32547.0 tgs: 62 data_time: 0.82s time: 523.16s eta: 3 days, 7:38:09
|
| 348 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.1GB text_tokens: 32221.0 tgs: 61 data_time: 0.77s time: 523.63s eta: 3 days, 7:33:44
|
| 350 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.9GB text_tokens: 31075.0 tgs: 59 data_time: 0.56s time: 520.16s eta: 3 days, 6:53:28
|
| 352 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 32.9GB text_tokens: 30517.0 tgs: 58 data_time: 0.74s time: 520.06s eta: 3 days, 6:43:50
|
| 354 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 32.7GB text_tokens: 31792.0 tgs: 60 data_time: 0.81s time: 523.76s eta: 3 days, 7:08:46
|
| 356 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
|
| 357 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.1GB text_tokens: 31413.0 tgs: 59 data_time: 1.04s time: 523.79s eta: 3 days, 7:00:17
|
| 358 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
|
| 359 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.0GB text_tokens: 31691.0 tgs: 60 data_time: 0.83s time: 520.24s eta: 3 days, 6:19:29
|
| 360 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
|
| 361 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 32130.0 tgs: 61 data_time: 0.67s time: 518.78s eta: 3 days, 5:57:39
|
| 362 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
|
| 363 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 32.7GB text_tokens: 31532.0 tgs: 60 data_time: 0.59s time: 522.67s eta: 3 days, 6:24:00
|
| 364 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 32.4GB text_tokens: 31076.0 tgs: 59 data_time: 0.76s time: 522.73s eta: 3 days, 6:15:49
|
| 366 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.1GB text_tokens: 31660.0 tgs: 60 data_time: 0.73s time: 520.53s eta: 3 days, 5:47:24
|
| 368 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 32304.0 tgs: 62 data_time: 0.66s time: 519.66s eta: 3 days, 5:30:57
|
| 370 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 32.9GB text_tokens: 31855.0 tgs: 61 data_time: 0.86s time: 521.30s eta: 3 days, 5:36:57
|
| 372 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.9GB text_tokens: 31620.0 tgs: 60 data_time: 0.61s time: 523.60s eta: 3 days, 5:48:43
|
| 374 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.0GB text_tokens: 32321.0 tgs: 62 data_time: 0.87s time: 519.32s eta: 3 days, 5:01:59
|
| 376 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 32465.0 tgs: 62 data_time: 0.74s time: 520.64s eta: 3 days, 5:05:01
|
| 378 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 32.8GB text_tokens: 31067.0 tgs: 59 data_time: 0.74s time: 521.45s eta: 3 days, 5:03:33
|
| 380 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.9GB text_tokens: 31472.0 tgs: 60 data_time: 0.80s time: 523.95s eta: 3 days, 5:16:57
|
| 382 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 30640.0 tgs: 59 data_time: 1.07s time: 518.85s eta: 3 days, 4:23:12
|
| 384 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.1GB text_tokens: 32323.0 tgs: 62 data_time: 0.78s time: 519.24s eta: 3 days, 4:17:55
|
| 386 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.0GB text_tokens: 32138.0 tgs: 61 data_time: 0.81s time: 520.72s eta: 3 days, 4:22:18
|
| 388 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.1GB text_tokens: 31246.0 tgs: 59 data_time: 0.73s time: 524.26s eta: 3 days, 4:44:43
|
| 390 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 30657.0 tgs: 58 data_time: 0.62s time: 520.32s eta: 3 days, 4:01:30
|
| 392 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 32.7GB text_tokens: 31712.0 tgs: 61 data_time: 0.93s time: 519.50s eta: 3 days, 3:45:36
|
| 394 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 59][DP 14][SP 3][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: nan loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 31495.0 tgs: 60 data_time: 0.72s time: 522.32s eta: 3 days, 4:01:35
|
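Every step in the rank 59 excerpt above trips the same guard: the global gradient norm comes back NaN/Inf, the optimizer step is skipped, and the `if_nan_skip` counter grows, so through step 70 no parameter update had actually been applied. A minimal sketch of that guard, assuming a standard PyTorch training loop; the function and counter below mirror the log fields but are not XTuner's implementation:

```python
import math

import torch

nan_skip_count = 0  # corresponds to `if_nan_skip` in the log lines

def clip_and_step(model, optimizer, step: int, max_grad_norm: float = 1.0) -> bool:
    """Clip gradients, then step only if the total grad norm is finite."""
    global nan_skip_count
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if not math.isfinite(grad_norm.item()):
        nan_skip_count += 1
        print(f"[Step {step}] The grad norm is NaN or Inf, skip this step. "
              f"Skipped {nan_skip_count} steps in total.")
        optimizer.zero_grad()
        return False
    optimizer.step()
    optimizer.zero_grad()
    return True
```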
20250120235238/rank6.log
ADDED
|
@@ -0,0 +1,395 @@
| 1 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
|
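The Namespace dump above is easier to scan as a plain mapping. The values below are copied verbatim from that line (including the upstream flag spelling `mirco_batch_size`); the dict itself is illustrative, not an XTuner API:

```python
# Key hyperparameters from the Namespace above, verbatim.
train_config = {
    # Local HF snapshot of Qwen2.5-72B-Instruct, per the `llm` path in the log.
    "llm": "/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842",
    "chat_template": "qwen2",
    "sp_size": 4,                 # sequence-parallel degree, matches [SP x] in the log prefix
    "max_length": 32768,          # longer prompts are discarded (see WARNINGs below)
    "dset_pack_level": "soft",
    "global_pack": True,
    "mirco_batch_size": 1,
    "global_batch_size": 64,
    "lr": 2e-05,
    "lr_min": 6e-06,
    "wd": 0.01,
    "max_grad_norm": 1,
    "epochs": 1,
    "warmup_ratio": 0.025,
    "seed": 0,
}
```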
| 2 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-20 23:54:30][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
|
| 6 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
|
| 7 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
|
| 8 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
|
| 9 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
|
| 10 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
|
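The eight WARNINGs above come from a length filter applied while caching the dataset: any sample whose tokenized prompt exceeds `max_length=32768` is dropped before packing. A minimal sketch, assuming samples are tokenized up front; `filter_by_length` is a hypothetical helper, not XTuner's:

```python
def filter_by_length(samples, tokenize, max_length=32768, path=""):
    """Drop samples whose tokenized prompt exceeds max_length, logging a count."""
    kept, dropped = [], 0
    for sample in samples:
        if len(tokenize(sample)) > max_length:
            dropped += 1
        else:
            kept.append(sample)
    if dropped:
        print(f"[WARNING] {path} has {dropped} prompt length>{max_length}, discard.")
    return kept
```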
| 11 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.13s
|
| 12 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 77 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 78 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 79 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 80 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 81 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 82 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 83 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 84 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 85 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 86 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:07:53][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
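The DEBUG lines above show XTuner rebinding each matched submodule's `forward` to an optimized implementation: `rms_norm_forward` for every `Qwen2RMSNorm` and `qwen2_attn_flash_forward` for every `Qwen2FlashAttention2`. A minimal sketch of that rebinding pattern follows; it is an illustration only (the `rms_norm_forward` body and the `DISPATCH_MAP` table are assumptions, not XTuner's actual code), but it reproduces the per-module dispatch log seen here.

```python
import types
import torch
import torch.nn as nn

def rms_norm_forward(self, hidden_states):
    # Hypothetical stand-in for an optimized RMSNorm kernel. Assumes the
    # module exposes `weight` and `variance_epsilon`, as HF's Qwen2RMSNorm does.
    dtype = hidden_states.dtype
    x = hidden_states.float()
    x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.variance_epsilon)
    return self.weight * x.to(dtype)

DISPATCH_MAP = {"Qwen2RMSNorm": rms_norm_forward}  # assumed mapping table

def dispatch_forwards(model: nn.Module, log=print):
    for name, module in model.named_modules():
        fn = DISPATCH_MAP.get(type(module).__name__)
        if fn is not None:
            # Bind the plain function as this instance's forward method.
            module.forward = types.MethodType(fn, module)
            log(f"Dispatch {name}({type(module).__name__}) forward to `{fn.__name__}`")
```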
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 149.88 seconds, peak gpu memory 13.4G
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.312 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 32.6GB text_tokens: 31121.0 tgs: 56 data_time: 2.32s time: 546.76s eta: 3 days, 18:03:47
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 32.9GB text_tokens: 31200.0 tgs: 59 data_time: 0.88s time: 523.22s eta: 3 days, 14:02:24
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.323 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.8GB text_tokens: 30861.0 tgs: 59 data_time: 0.98s time: 522.94s eta: 3 days, 13:50:56
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.1GB text_tokens: 31960.0 tgs: 61 data_time: 0.99s time: 520.27s eta: 3 days, 13:16:01
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 32.9GB text_tokens: 31620.0 tgs: 60 data_time: 1.27s time: 520.97s eta: 3 days, 13:14:08
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.324 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.1GB text_tokens: 31899.0 tgs: 61 data_time: 0.82s time: 520.91s eta: 3 days, 13:04:54
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 32173.0 tgs: 61 data_time: 1.02s time: 523.37s eta: 3 days, 13:20:16
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.232 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.0GB text_tokens: 32073.0 tgs: 61 data_time: 0.86s time: 520.65s eta: 3 days, 12:44:59
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 32230.0 tgs: 61 data_time: 0.86s time: 520.16s eta: 3 days, 12:31:32
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.1GB text_tokens: 32211.0 tgs: 61 data_time: 0.72s time: 520.47s eta: 3 days, 12:25:56
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.296 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.0GB text_tokens: 31612.0 tgs: 60 data_time: 0.95s time: 524.52s eta: 3 days, 12:56:32
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.1GB text_tokens: 31664.0 tgs: 60 data_time: 0.71s time: 520.63s eta: 3 days, 12:10:08
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.297 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.0GB text_tokens: 31279.0 tgs: 60 data_time: 0.71s time: 519.96s eta: 3 days, 11:54:56
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.8GB text_tokens: 31574.0 tgs: 60 data_time: 0.88s time: 521.27s eta: 3 days, 11:58:57
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 32.6GB text_tokens: 31608.0 tgs: 60 data_time: 0.74s time: 524.11s eta: 3 days, 12:17:37
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 32396.0 tgs: 62 data_time: 0.88s time: 520.52s eta: 3 days, 11:34:22
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 32445.0 tgs: 62 data_time: 0.78s time: 518.89s eta: 3 days, 11:09:58
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.1GB text_tokens: 32483.0 tgs: 62 data_time: 0.93s time: 522.09s eta: 3 days, 11:32:02
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.8GB text_tokens: 30947.0 tgs: 59 data_time: 0.90s time: 523.81s eta: 3 days, 11:39:52
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.0GB text_tokens: 32390.0 tgs: 62 data_time: 0.81s time: 520.47s eta: 3 days, 10:59:08
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.325 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 32.9GB text_tokens: 31866.0 tgs: 61 data_time: 0.88s time: 518.42s eta: 3 days, 10:30:52
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 32.9GB text_tokens: 31128.0 tgs: 59 data_time: 0.84s time: 522.82s eta: 3 days, 11:04:14
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.348 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 32297.0 tgs: 61 data_time: 0.70s time: 523.52s eta: 3 days, 11:02:10
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.323 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 32495.0 tgs: 62 data_time: 1.06s time: 521.01s eta: 3 days, 10:29:35
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.9GB text_tokens: 31533.0 tgs: 60 data_time: 0.74s time: 519.08s eta: 3 days, 10:02:39
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 32.4GB text_tokens: 31074.0 tgs: 59 data_time: 0.78s time: 523.28s eta: 3 days, 10:33:40
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.387 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 32081.0 tgs: 61 data_time: 0.88s time: 522.97s eta: 3 days, 10:22:01
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 31457.0 tgs: 60 data_time: 0.82s time: 521.30s eta: 3 days, 9:57:35
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.9GB text_tokens: 31099.0 tgs: 59 data_time: 0.74s time: 520.62s eta: 3 days, 9:42:31
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 32202.0 tgs: 61 data_time: 0.76s time: 522.42s eta: 3 days, 9:50:42
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.7GB text_tokens: 30996.0 tgs: 59 data_time: 0.71s time: 523.77s eta: 3 days, 9:54:42
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.0GB text_tokens: 31401.0 tgs: 60 data_time: 0.93s time: 520.41s eta: 3 days, 9:14:29
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.7GB text_tokens: 31506.0 tgs: 60 data_time: 0.90s time: 520.66s eta: 3 days, 9:08:10
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.0GB text_tokens: 31849.0 tgs: 61 data_time: 0.83s time: 521.32s eta: 3 days, 9:05:39
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.0GB text_tokens: 31503.0 tgs: 60 data_time: 0.68s time: 524.09s eta: 3 days, 9:22:47
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.232 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 32241.0 tgs: 61 data_time: 0.81s time: 520.94s eta: 3 days, 8:44:44
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.0GB text_tokens: 31502.0 tgs: 60 data_time: 0.86s time: 520.21s eta: 3 days, 8:29:18
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.310 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 32.8GB text_tokens: 29886.0 tgs: 57 data_time: 1.04s time: 522.55s eta: 3 days, 8:42:17
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 32309.0 tgs: 61 data_time: 0.80s time: 524.31s eta: 3 days, 8:49:54
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.8GB text_tokens: 31388.0 tgs: 60 data_time: 0.96s time: 519.87s eta: 3 days, 8:00:06
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.216 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 31748.0 tgs: 61 data_time: 0.84s time: 520.12s eta: 3 days, 7:53:45
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.321 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.1GB text_tokens: 31342.0 tgs: 60 data_time: 0.70s time: 522.00s eta: 3 days, 8:02:22
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.0GB text_tokens: 31840.0 tgs: 60 data_time: 0.93s time: 524.33s eta: 3 days, 8:15:08
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 31789.0 tgs: 61 data_time: 0.66s time: 520.56s eta: 3 days, 7:31:45
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.0GB text_tokens: 31941.0 tgs: 61 data_time: 0.83s time: 518.55s eta: 3 days, 7:04:42
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 32.2GB text_tokens: 30569.0 tgs: 58 data_time: 0.75s time: 523.14s eta: 3 days, 7:37:58
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.1GB text_tokens: 31307.0 tgs: 59 data_time: 0.87s time: 523.61s eta: 3 days, 7:33:32
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.201 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.6GB text_tokens: 31568.0 tgs: 60 data_time: 0.80s time: 520.20s eta: 3 days, 6:53:50
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 31707.0 tgs: 60 data_time: 1.09s time: 520.04s eta: 3 days, 6:43:40
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 31866.0 tgs: 60 data_time: 0.92s time: 523.74s eta: 3 days, 7:08:34
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.1GB text_tokens: 31584.0 tgs: 60 data_time: 0.96s time: 523.82s eta: 3 days, 7:00:36
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.0GB text_tokens: 31870.0 tgs: 61 data_time: 0.96s time: 520.22s eta: 3 days, 6:19:20
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.8GB text_tokens: 30551.0 tgs: 58 data_time: 0.60s time: 518.76s eta: 3 days, 5:57:29
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.230 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.1GB text_tokens: 31961.0 tgs: 61 data_time: 0.83s time: 522.67s eta: 3 days, 6:24:00
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 31763.0 tgs: 60 data_time: 1.00s time: 522.77s eta: 3 days, 6:16:11
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.1GB text_tokens: 31917.0 tgs: 61 data_time: 1.12s time: 520.51s eta: 3 days, 5:47:13
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:26:33][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 31804.0 tgs: 61 data_time: 0.72s time: 519.64s eta: 3 days, 5:30:47
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 32278.0 tgs: 61 data_time: 0.99s time: 521.33s eta: 3 days, 5:37:15
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.9GB text_tokens: 31927.0 tgs: 60 data_time: 0.81s time: 523.57s eta: 3 days, 5:48:31
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.236 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 31369.0 tgs: 60 data_time: 0.98s time: 519.29s eta: 3 days, 5:01:40
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.0GB text_tokens: 32214.0 tgs: 61 data_time: 0.83s time: 520.66s eta: 3 days, 5:05:14
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 31816.0 tgs: 61 data_time: 0.95s time: 521.47s eta: 3 days, 5:03:41
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.9GB text_tokens: 31958.0 tgs: 60 data_time: 0.64s time: 523.93s eta: 3 days, 5:16:45
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 31545.0 tgs: 60 data_time: 1.05s time: 518.83s eta: 3 days, 4:23:01
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.1GB text_tokens: 31565.0 tgs: 60 data_time: 0.81s time: 519.27s eta: 3 days, 4:18:14
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.0GB text_tokens: 31753.0 tgs: 60 data_time: 0.71s time: 520.70s eta: 3 days, 4:22:08
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 32.5GB text_tokens: 30827.0 tgs: 58 data_time: 0.74s time: 524.24s eta: 3 days, 4:44:32
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 32.8GB text_tokens: 31476.0 tgs: 60 data_time: 1.07s time: 520.34s eta: 3 days, 4:01:37
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 31389.0 tgs: 60 data_time: 0.44s time: 519.52s eta: 3 days, 3:45:47
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
[XTuner][RANK 6][DP 1][SP 2][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.216 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 32.4GB text_tokens: 30909.0 tgs: 59 data_time: 0.92s time: 522.30s eta: 3 days, 4:01:24
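Every optimizer step in this log trips the NaN/Inf guard: `if_nan_skip` climbs one-for-one with the step counter, and while the locally logged loss stays finite (0.2-0.39), `loss(reduced)` and `grad_norm` are NaN from step 0, which points at an overflow on some rank or in the cross-rank reduction rather than in this rank's forward pass. The guard itself is a standard pattern; a minimal sketch (not XTuner's actual implementation) looks like this:

```python
import torch

_nan_skips = 0

def clip_and_step(model, optimizer, max_grad_norm=1.0):
    """Skip the parameter update when the gradient norm is non-finite."""
    global _nan_skips
    # clip_grad_norm_ returns the total norm computed *before* clipping,
    # so any overflow in the backward pass surfaces here as nan/inf.
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if torch.isfinite(grad_norm):
        optimizer.step()
    else:
        _nan_skips += 1
        print(f"The grad norm is NaN or Inf, skip this step. "
              f"Skipped {_nan_skips} steps in total.")
    optimizer.zero_grad()  # in either case, drop this step's gradients
    return grad_norm
```

When every step is skipped, as here, the weights never update, so the underlying overflow has to be fixed rather than waited out.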
20250120235238/rank60.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.12s
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 142.73 seconds, peak gpu memory 13.4G
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 32.9GB text_tokens: 31861.0 tgs: 58 data_time: 1.74s time: 546.72s eta: 3 days, 18:03:26
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.223 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 32.9GB text_tokens: 31345.0 tgs: 59 data_time: 0.63s time: 523.25s eta: 3 days, 14:02:41
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 30897.0 tgs: 59 data_time: 0.80s time: 522.87s eta: 3 days, 13:50:15
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.218 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 32.7GB text_tokens: 30727.0 tgs: 59 data_time: 0.82s time: 520.29s eta: 3 days, 13:16:13
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.229 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.0GB text_tokens: 31993.0 tgs: 61 data_time: 0.64s time: 520.99s eta: 3 days, 13:14:20
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.0GB text_tokens: 32528.0 tgs: 62 data_time: 0.78s time: 520.91s eta: 3 days, 13:04:55
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.307 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 32.9GB text_tokens: 31985.0 tgs: 61 data_time: 0.59s time: 523.39s eta: 3 days, 13:20:28
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.205 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.0GB text_tokens: 32411.0 tgs: 62 data_time: 0.81s time: 520.67s eta: 3 days, 12:45:09
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 32.6GB text_tokens: 30816.0 tgs: 59 data_time: 0.67s time: 520.18s eta: 3 days, 12:31:45
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.9GB text_tokens: 32133.0 tgs: 61 data_time: 1.06s time: 520.37s eta: 3 days, 12:24:55
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 32.8GB text_tokens: 31934.0 tgs: 60 data_time: 0.81s time: 524.54s eta: 3 days, 12:56:44
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.297 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.9GB text_tokens: 31332.0 tgs: 60 data_time: 0.82s time: 520.65s eta: 3 days, 12:10:19
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.0GB text_tokens: 32182.0 tgs: 61 data_time: 0.83s time: 519.97s eta: 3 days, 11:55:03
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.9GB text_tokens: 30812.0 tgs: 59 data_time: 0.57s time: 521.29s eta: 3 days, 11:59:08
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 32580.0 tgs: 62 data_time: 0.65s time: 524.13s eta: 3 days, 12:17:49
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 32.7GB text_tokens: 30706.0 tgs: 58 data_time: 0.79s time: 520.54s eta: 3 days, 11:34:33
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 32.9GB text_tokens: 31982.0 tgs: 61 data_time: 0.79s time: 518.78s eta: 3 days, 11:08:55
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 32212.0 tgs: 61 data_time: 0.83s time: 522.11s eta: 3 days, 11:32:15
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.0GB text_tokens: 31965.0 tgs: 61 data_time: 0.92s time: 523.83s eta: 3 days, 11:40:03
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.0GB text_tokens: 31887.0 tgs: 61 data_time: 0.82s time: 520.49s eta: 3 days, 10:59:20
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.218 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 31686.0 tgs: 61 data_time: 0.96s time: 518.43s eta: 3 days, 10:31:02
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.220 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.0GB text_tokens: 31729.0 tgs: 60 data_time: 0.71s time: 522.84s eta: 3 days, 11:04:25
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 31873.0 tgs: 60 data_time: 0.78s time: 523.50s eta: 3 days, 11:01:57
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.308 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 31838.0 tgs: 61 data_time: 0.75s time: 520.96s eta: 3 days, 10:29:08
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.9GB text_tokens: 31923.0 tgs: 61 data_time: 0.85s time: 519.10s eta: 3 days, 10:02:49
|
| 306 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
|
| 307 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.0GB text_tokens: 31822.0 tgs: 60 data_time: 0.71s time: 523.29s eta: 3 days, 10:33:51
|
| 308 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
|
| 309 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.300 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 32.9GB text_tokens: 32115.0 tgs: 61 data_time: 1.16s time: 522.95s eta: 3 days, 10:21:51
|
| 310 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
|
| 311 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 32433.0 tgs: 62 data_time: 0.90s time: 521.32s eta: 3 days, 9:57:46
|
| 312 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
|
| 313 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.8GB text_tokens: 31453.0 tgs: 60 data_time: 0.93s time: 520.64s eta: 3 days, 9:42:41
|
| 314 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
|
| 315 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 32050.0 tgs: 61 data_time: 0.90s time: 522.41s eta: 3 days, 9:50:38
|
| 316 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
|
| 317 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.0GB text_tokens: 32259.0 tgs: 61 data_time: 0.68s time: 523.76s eta: 3 days, 9:54:34
|
| 318 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
|
| 319 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 32.9GB text_tokens: 32000.0 tgs: 61 data_time: 0.73s time: 520.43s eta: 3 days, 9:14:40
|
| 320 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
|
| 321 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.229 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.6GB text_tokens: 30717.0 tgs: 58 data_time: 0.75s time: 520.68s eta: 3 days, 9:08:21
|
| 322 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
|
| 323 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.1GB text_tokens: 31683.0 tgs: 60 data_time: 0.85s time: 521.26s eta: 3 days, 9:05:07
|
| 324 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
|
| 325 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.363 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.1GB text_tokens: 32425.0 tgs: 61 data_time: 1.09s time: 524.11s eta: 3 days, 9:22:59
|
| 326 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
|
| 327 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.1GB text_tokens: 31127.0 tgs: 59 data_time: 0.64s time: 520.96s eta: 3 days, 8:44:56
|
| 328 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
|
| 329 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.297 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.1GB text_tokens: 31857.0 tgs: 61 data_time: 0.78s time: 520.17s eta: 3 days, 8:28:56
|
| 330 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
|
| 331 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.0GB text_tokens: 31766.0 tgs: 60 data_time: 0.78s time: 522.57s eta: 3 days, 8:42:27
|
| 332 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
|
| 333 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.9GB text_tokens: 32158.0 tgs: 61 data_time: 0.88s time: 524.33s eta: 3 days, 8:50:05
|
| 334 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
|
| 335 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.0GB text_tokens: 32032.0 tgs: 61 data_time: 0.83s time: 519.89s eta: 3 days, 8:00:18
|
| 336 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
|
| 337 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 32.8GB text_tokens: 31101.0 tgs: 59 data_time: 0.74s time: 520.09s eta: 3 days, 7:53:29
|
| 338 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
|
| 339 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 32.4GB text_tokens: 30739.0 tgs: 58 data_time: 0.91s time: 522.00s eta: 3 days, 8:02:24
|
| 340 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 32313.0 tgs: 61 data_time: 0.82s time: 524.36s eta: 3 days, 8:15:20
|
| 342 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 32.8GB text_tokens: 30208.0 tgs: 58 data_time: 0.67s time: 520.51s eta: 3 days, 7:31:18
|
| 344 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.9GB text_tokens: 31514.0 tgs: 60 data_time: 0.93s time: 518.57s eta: 3 days, 7:04:53
|
| 346 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 31887.0 tgs: 60 data_time: 0.90s time: 523.16s eta: 3 days, 7:38:09
|
| 348 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.314 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 32.6GB text_tokens: 31263.0 tgs: 59 data_time: 0.71s time: 523.63s eta: 3 days, 7:33:43
|
| 350 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.9GB text_tokens: 31940.0 tgs: 61 data_time: 0.89s time: 520.16s eta: 3 days, 6:53:29
|
| 352 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 32230.0 tgs: 61 data_time: 1.25s time: 520.06s eta: 3 days, 6:43:50
|
| 354 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.0GB text_tokens: 31041.0 tgs: 59 data_time: 1.02s time: 523.76s eta: 3 days, 7:08:46
|
| 356 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
|
| 357 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.1GB text_tokens: 31939.0 tgs: 60 data_time: 0.64s time: 523.79s eta: 3 days, 7:00:15
|
| 358 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
|
| 359 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.0GB text_tokens: 31788.0 tgs: 61 data_time: 1.17s time: 520.24s eta: 3 days, 6:19:31
|
| 360 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
|
| 361 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 32224.0 tgs: 62 data_time: 0.70s time: 518.78s eta: 3 days, 5:57:39
|
| 362 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
|
| 363 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.236 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.0GB text_tokens: 32035.0 tgs: 61 data_time: 0.72s time: 522.67s eta: 3 days, 6:24:00
|
| 364 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 32.3GB text_tokens: 30404.0 tgs: 58 data_time: 0.75s time: 522.72s eta: 3 days, 6:15:48
|
| 366 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.310 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.1GB text_tokens: 32351.0 tgs: 62 data_time: 1.08s time: 520.53s eta: 3 days, 5:47:24
|
| 368 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 32058.0 tgs: 61 data_time: 0.68s time: 519.66s eta: 3 days, 5:30:58
|
| 370 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 32.9GB text_tokens: 32084.0 tgs: 61 data_time: 0.92s time: 521.30s eta: 3 days, 5:36:57
|
| 372 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.9GB text_tokens: 31472.0 tgs: 60 data_time: 0.70s time: 523.59s eta: 3 days, 5:48:42
|
| 374 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.1GB text_tokens: 31503.0 tgs: 60 data_time: 0.80s time: 519.33s eta: 3 days, 5:01:59
|
| 376 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 32460.0 tgs: 62 data_time: 0.68s time: 520.64s eta: 3 days, 5:05:00
|
| 378 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 32.9GB text_tokens: 31680.0 tgs: 60 data_time: 0.43s time: 521.45s eta: 3 days, 5:03:33
|
| 380 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 31919.0 tgs: 60 data_time: 0.69s time: 523.95s eta: 3 days, 5:16:56
|
| 382 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.217 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 32.9GB text_tokens: 31721.0 tgs: 61 data_time: 0.77s time: 518.85s eta: 3 days, 4:23:12
|
| 384 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.0GB text_tokens: 32329.0 tgs: 62 data_time: 0.70s time: 519.20s eta: 3 days, 4:17:35
|
| 386 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.216 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 32.8GB text_tokens: 31333.0 tgs: 60 data_time: 0.78s time: 520.72s eta: 3 days, 4:22:18
|
| 388 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.1GB text_tokens: 27236.0 tgs: 51 data_time: 0.62s time: 524.26s eta: 3 days, 4:44:43
|
| 390 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 32.9GB text_tokens: 32171.0 tgs: 61 data_time: 0.83s time: 520.32s eta: 3 days, 4:01:30
|
| 392 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 32389.0 tgs: 62 data_time: 1.00s time: 519.50s eta: 3 days, 3:45:36
|
| 394 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 60][DP 15][SP 0][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 31521.0 tgs: 60 data_time: 0.94s time: 522.32s eta: 3 days, 4:01:35
|
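The rank60 entries above show a consistent failure signature: each rank's local loss stays finite (roughly 0.22-0.36), but loss(reduced), the mean across data-parallel ranks, is NaN, and so is grad_norm, so every step from 17 through 70 is skipped; at least one other rank must be producing non-finite values. The throughput fields are self-consistent: tgs is roughly text_tokens / time (e.g. 31982 / 518.78 s ≈ 61.7, logged as 61), and eta is the remaining steps times an apparently smoothed step time ((593 − 17) × ≈519.7 s ≈ 3 days, 11:08:55). Below is a minimal sketch, not XTuner's actual implementation, of the kind of guard that emits the "grad norm is NaN or Inf, skip this step" warnings: clip the global gradient norm, and if it is non-finite, drop the gradients instead of stepping the optimizer.

```python
import logging

import torch

logger = logging.getLogger("XTuner")


def clip_or_skip(model: torch.nn.Module,
                 optimizer: torch.optim.Optimizer,
                 step: int,
                 skipped: int,
                 max_grad_norm: float = 1.0) -> int:
    """Clip gradients; skip the update if the norm is NaN/Inf.

    Returns the updated count of skipped steps (the log's `if_nan_skip`).
    """
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_grad_norm)
    if not torch.isfinite(grad_norm):
        skipped += 1
        logger.warning(f"[Step {step}] The grad norm is NaN or Inf, "
                       f"skip this step. Skipped {skipped} steps in total.")
    else:
        optimizer.step()
    optimizer.zero_grad()  # drop the gradients either way
    return skipped
```

The max_grad_norm=1.0 default matches the max_grad_norm=1 argument recorded in the Namespace dump below.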
20250120235238/rank62.log
ADDED
|
@@ -0,0 +1,395 @@
|
| 1 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
|
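A few schedule facts can be read straight off this argument dump: with 593 total steps and warmup_ratio=0.025, warmup lasts about int(593 × 0.025) = 14 steps, which is why the step entries above already sit at the peak lr of 0.000020 from step 17 onward; lr_min=6e-06 is the decay floor. A sketch of the implied schedule, assuming linear warmup followed by cosine decay (the warmup and decay shapes are assumptions, not confirmed by the log):

```python
import math

from torch.optim.lr_scheduler import LambdaLR


def build_scheduler(optimizer, total_steps: int = 593,
                    warmup_ratio: float = 0.025,
                    lr: float = 2e-5, lr_min: float = 6e-6) -> LambdaLR:
    warmup_steps = int(total_steps * warmup_ratio)  # 593 * 0.025 -> 14

    def factor(step: int) -> float:
        if step < warmup_steps:
            return (step + 1) / warmup_steps          # linear warmup
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        floor = lr_min / lr                           # 0.3
        return floor + (1 - floor) * 0.5 * (1 + math.cos(math.pi * progress))

    return LambdaLR(optimizer, factor)
```

Early in the decay the cosine term is still ≈1, so the displayed lr rounds to 0.000020 for many steps, matching the training entries above.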
| 2 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
|
| 6 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
|
| 7 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
|
| 8 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
|
| 9 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
|
| 10 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
|
| 11 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.11s
|
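The eight warnings above are the dataset length filter at work: per shard, samples whose tokenized prompt exceeds max_length=32768 are dropped before packing, and the 443.11 s "[Dataset & Dataloader]" cost covers this scan plus tokenization, soft packing, and caching. A minimal sketch of such a filter, assuming OpenAI-format JSONL with a "messages" field and a Hugging Face tokenizer (both assumptions; XTuner's real pipeline also applies the chat template and writes a cache):

```python
import json
import logging

logger = logging.getLogger("XTuner")


def filter_jsonl(path: str, tokenizer, max_length: int = 32768) -> list:
    """Drop samples whose tokenized prompt exceeds max_length."""
    kept, dropped = [], 0
    with open(path) as f:
        for line in f:
            sample = json.loads(line)
            # "messages" is the assumed OpenAI-format field name.
            text = "".join(m["content"] for m in sample["messages"])
            if len(tokenizer(text).input_ids) > max_length:
                dropped += 1
            else:
                kept.append(sample)
    if dropped:
        logger.warning(f"{path} has {dropped} prompt length>{max_length}, "
                       f"discard.")
    return kept
```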
| 12 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
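The DEBUG lines that follow record XTuner rebinding each submodule's forward to an optimized implementation: Qwen2FlashAttention2 to `qwen2_attn_flash_forward` and Qwen2RMSNorm to `rms_norm_forward`, one attention module and two norms per decoder layer. A sketch of the generic mechanism, matching module classes by name and rebinding forward as a bound method (the mapping values are stand-ins for XTuner's optimized functions, which are not shown here):

```python
import types

import torch


def dispatch_forward(model: torch.nn.Module, dispatch_map: dict) -> None:
    """Rebind forward for every module whose class name is in dispatch_map."""
    for name, module in model.named_modules():
        fn = dispatch_map.get(type(module).__name__)
        if fn is not None:
            module.forward = types.MethodType(fn, module)
            print(f"Dispatch {name}({type(module).__name__}) "
                  f"forward to `{fn.__name__}`")


# Usage with the targets named in the log (their bodies are assumptions):
# dispatch_forward(model, {
#     "Qwen2FlashAttention2": qwen2_attn_flash_forward,
#     "Qwen2RMSNorm": rms_norm_forward,
# })
```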
| 13 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 77 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 78 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 79 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 80 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 81 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 82 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 83 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 84 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 85 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 86 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:00][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:08:01][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 142.52 seconds, peak gpu memory 13.4G
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 32.9GB text_tokens: 31861.0 tgs: 58 data_time: 1.74s time: 546.73s eta: 3 days, 18:03:31
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.225 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 32.9GB text_tokens: 31345.0 tgs: 59 data_time: 0.61s time: 523.25s eta: 3 days, 14:02:41
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 30897.0 tgs: 59 data_time: 0.79s time: 522.87s eta: 3 days, 13:50:15
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 32.7GB text_tokens: 30727.0 tgs: 59 data_time: 0.80s time: 520.29s eta: 3 days, 13:16:13
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.0GB text_tokens: 31993.0 tgs: 61 data_time: 0.57s time: 520.99s eta: 3 days, 13:14:21
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.0GB text_tokens: 32528.0 tgs: 62 data_time: 0.69s time: 520.91s eta: 3 days, 13:04:55
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.270 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 32.9GB text_tokens: 31985.0 tgs: 61 data_time: 0.57s time: 523.39s eta: 3 days, 13:20:28
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.336 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.0GB text_tokens: 32411.0 tgs: 62 data_time: 0.74s time: 520.66s eta: 3 days, 12:45:09
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 32.6GB text_tokens: 30816.0 tgs: 59 data_time: 0.66s time: 520.18s eta: 3 days, 12:31:46
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.307 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.9GB text_tokens: 32133.0 tgs: 61 data_time: 1.03s time: 520.37s eta: 3 days, 12:24:55
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 32.8GB text_tokens: 31934.0 tgs: 60 data_time: 0.80s time: 524.54s eta: 3 days, 12:56:44
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 32.9GB text_tokens: 31332.0 tgs: 60 data_time: 0.80s time: 520.65s eta: 3 days, 12:10:19
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.0GB text_tokens: 32182.0 tgs: 61 data_time: 0.81s time: 519.97s eta: 3 days, 11:55:02
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.9GB text_tokens: 30812.0 tgs: 59 data_time: 0.57s time: 521.29s eta: 3 days, 11:59:09
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 32580.0 tgs: 62 data_time: 0.65s time: 524.13s eta: 3 days, 12:17:49
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 32.7GB text_tokens: 30706.0 tgs: 58 data_time: 0.75s time: 520.54s eta: 3 days, 11:34:32
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 32.9GB text_tokens: 31982.0 tgs: 61 data_time: 0.77s time: 518.78s eta: 3 days, 11:08:55
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 32212.0 tgs: 61 data_time: 0.81s time: 522.11s eta: 3 days, 11:32:14
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 33.0GB text_tokens: 31965.0 tgs: 61 data_time: 0.86s time: 523.83s eta: 3 days, 11:40:04
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.0GB text_tokens: 31887.0 tgs: 61 data_time: 0.82s time: 520.49s eta: 3 days, 10:59:20
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.297 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 31686.0 tgs: 61 data_time: 0.96s time: 518.44s eta: 3 days, 10:31:03
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.0GB text_tokens: 31729.0 tgs: 60 data_time: 0.69s time: 522.84s eta: 3 days, 11:04:25
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 31873.0 tgs: 60 data_time: 0.74s time: 523.50s eta: 3 days, 11:01:57
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 31838.0 tgs: 61 data_time: 0.78s time: 520.96s eta: 3 days, 10:29:08
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.9GB text_tokens: 31923.0 tgs: 61 data_time: 0.87s time: 519.10s eta: 3 days, 10:02:49
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.0GB text_tokens: 31822.0 tgs: 60 data_time: 0.71s time: 523.30s eta: 3 days, 10:33:52
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.295 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 32.9GB text_tokens: 32115.0 tgs: 61 data_time: 1.15s time: 522.95s eta: 3 days, 10:21:51
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.339 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 32433.0 tgs: 62 data_time: 0.86s time: 521.32s eta: 3 days, 9:57:46
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.214 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.8GB text_tokens: 31453.0 tgs: 60 data_time: 0.91s time: 520.64s eta: 3 days, 9:42:41
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 32050.0 tgs: 61 data_time: 0.89s time: 522.41s eta: 3 days, 9:50:39
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.343 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.0GB text_tokens: 32259.0 tgs: 61 data_time: 0.63s time: 523.75s eta: 3 days, 9:54:33
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 32.9GB text_tokens: 32000.0 tgs: 61 data_time: 0.71s time: 520.43s eta: 3 days, 9:14:40
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.232 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.6GB text_tokens: 30717.0 tgs: 58 data_time: 0.72s time: 520.68s eta: 3 days, 9:08:21
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.1GB text_tokens: 31683.0 tgs: 60 data_time: 0.81s time: 521.26s eta: 3 days, 9:05:07
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.320 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.1GB text_tokens: 32425.0 tgs: 61 data_time: 1.04s time: 524.11s eta: 3 days, 9:22:59
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.226 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.1GB text_tokens: 31127.0 tgs: 59 data_time: 0.61s time: 520.96s eta: 3 days, 8:44:56
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.1GB text_tokens: 31857.0 tgs: 61 data_time: 0.74s time: 520.17s eta: 3 days, 8:28:56
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 33.0GB text_tokens: 31766.0 tgs: 60 data_time: 0.80s time: 522.57s eta: 3 days, 8:42:27
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.9GB text_tokens: 32158.0 tgs: 61 data_time: 0.84s time: 524.33s eta: 3 days, 8:50:05
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.0GB text_tokens: 32032.0 tgs: 61 data_time: 0.81s time: 519.89s eta: 3 days, 8:00:17
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 32.8GB text_tokens: 31101.0 tgs: 59 data_time: 0.72s time: 520.09s eta: 3 days, 7:53:29
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 32.4GB text_tokens: 30739.0 tgs: 58 data_time: 0.91s time: 522.00s eta: 3 days, 8:02:25
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.355 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.1GB text_tokens: 32313.0 tgs: 61 data_time: 0.80s time: 524.35s eta: 3 days, 8:15:19
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.208 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 32.8GB text_tokens: 30208.0 tgs: 58 data_time: 0.65s time: 520.51s eta: 3 days, 7:31:18
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.9GB text_tokens: 31514.0 tgs: 60 data_time: 0.87s time: 518.57s eta: 3 days, 7:04:53
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.1GB text_tokens: 31887.0 tgs: 60 data_time: 0.86s time: 523.16s eta: 3 days, 7:38:09
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 32.6GB text_tokens: 31263.0 tgs: 59 data_time: 0.70s time: 523.63s eta: 3 days, 7:33:43
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.9GB text_tokens: 31940.0 tgs: 61 data_time: 0.86s time: 520.16s eta: 3 days, 6:53:29
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.229 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 32230.0 tgs: 61 data_time: 1.17s time: 520.06s eta: 3 days, 6:43:51
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.0GB text_tokens: 31041.0 tgs: 59 data_time: 0.97s time: 523.76s eta: 3 days, 7:08:46
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.246 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.1GB text_tokens: 31939.0 tgs: 60 data_time: 0.61s time: 523.79s eta: 3 days, 7:00:15
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.0GB text_tokens: 31788.0 tgs: 61 data_time: 1.15s time: 520.24s eta: 3 days, 6:19:30
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 33.1GB text_tokens: 32224.0 tgs: 62 data_time: 0.69s time: 518.78s eta: 3 days, 5:57:39
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.0GB text_tokens: 32035.0 tgs: 61 data_time: 0.70s time: 522.67s eta: 3 days, 6:24:00
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 32.3GB text_tokens: 30404.0 tgs: 58 data_time: 0.69s time: 522.72s eta: 3 days, 6:15:48
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.349 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.1GB text_tokens: 32351.0 tgs: 62 data_time: 1.03s time: 520.53s eta: 3 days, 5:47:24
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 32058.0 tgs: 61 data_time: 0.63s time: 519.66s eta: 3 days, 5:30:57
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 32.9GB text_tokens: 32084.0 tgs: 61 data_time: 0.88s time: 521.30s eta: 3 days, 5:36:57
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.9GB text_tokens: 31472.0 tgs: 60 data_time: 0.65s time: 523.59s eta: 3 days, 5:48:42
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 33.1GB text_tokens: 31503.0 tgs: 60 data_time: 0.78s time: 519.32s eta: 3 days, 5:01:58
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.332 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 32460.0 tgs: 62 data_time: 0.66s time: 520.64s eta: 3 days, 5:05:00
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 32.9GB text_tokens: 31680.0 tgs: 60 data_time: 0.47s time: 521.45s eta: 3 days, 5:03:33
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.288 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 31919.0 tgs: 60 data_time: 0.75s time: 523.95s eta: 3 days, 5:16:56
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.229 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 32.9GB text_tokens: 31721.0 tgs: 61 data_time: 0.77s time: 518.85s eta: 3 days, 4:23:12
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.0GB text_tokens: 32329.0 tgs: 62 data_time: 0.70s time: 519.24s eta: 3 days, 4:17:55
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 32.8GB text_tokens: 31333.0 tgs: 60 data_time: 0.77s time: 520.72s eta: 3 days, 4:22:18
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.214 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.1GB text_tokens: 27236.0 tgs: 51 data_time: 0.59s time: 524.26s eta: 3 days, 4:44:42
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.327 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 32.9GB text_tokens: 32171.0 tgs: 61 data_time: 0.80s time: 520.33s eta: 3 days, 4:01:31
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.374 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 32389.0 tgs: 62 data_time: 0.95s time: 519.50s eta: 3 days, 3:45:35
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
+[XTuner][RANK 62][DP 15][SP 2][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.293 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.1GB text_tokens: 31521.0 tgs: 60 data_time: 0.96s time: 522.32s eta: 3 days, 4:01:35
20250120235238/rank7.log
ADDED
@@ -0,0 +1,395 @@
| 1 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
|
| 2 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-20 23:54:30][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
|
| 6 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
|
| 7 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
|
| 8 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
|
| 9 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
|
| 10 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
|
| 11 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.31s
|
| 12 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:57][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:07:58][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 145.84 seconds, peak gpu memory 13.4G
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 32.6GB text_tokens: 31121.0 tgs: 56 data_time: 2.17s time: 546.75s eta: 3 days, 18:03:43
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 32.9GB text_tokens: 31200.0 tgs: 59 data_time: 0.83s time: 523.23s eta: 3 days, 14:02:30
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.335 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 32.8GB text_tokens: 30861.0 tgs: 59 data_time: 0.91s time: 522.94s eta: 3 days, 13:50:56
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.296 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 33.1GB text_tokens: 31960.0 tgs: 61 data_time: 0.94s time: 520.28s eta: 3 days, 13:16:02
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.299 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 32.9GB text_tokens: 31620.0 tgs: 60 data_time: 1.13s time: 520.97s eta: 3 days, 13:14:09
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.347 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 33.1GB text_tokens: 31899.0 tgs: 61 data_time: 0.76s time: 520.91s eta: 3 days, 13:04:55
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 33.1GB text_tokens: 32173.0 tgs: 61 data_time: 1.02s time: 523.37s eta: 3 days, 13:20:16
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.0GB text_tokens: 32073.0 tgs: 61 data_time: 0.75s time: 520.65s eta: 3 days, 12:44:58
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 32230.0 tgs: 61 data_time: 0.81s time: 520.16s eta: 3 days, 12:31:34
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.291 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 33.1GB text_tokens: 32211.0 tgs: 61 data_time: 0.70s time: 520.47s eta: 3 days, 12:25:57
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 33.0GB text_tokens: 31612.0 tgs: 60 data_time: 0.90s time: 524.52s eta: 3 days, 12:56:33
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.242 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.1GB text_tokens: 31664.0 tgs: 60 data_time: 0.66s time: 520.63s eta: 3 days, 12:10:08
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 33.0GB text_tokens: 31279.0 tgs: 60 data_time: 0.66s time: 519.96s eta: 3 days, 11:54:57
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 32.8GB text_tokens: 31574.0 tgs: 60 data_time: 0.85s time: 521.27s eta: 3 days, 11:58:57
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 32.6GB text_tokens: 31608.0 tgs: 60 data_time: 0.71s time: 524.11s eta: 3 days, 12:17:38
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.211 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.1GB text_tokens: 32396.0 tgs: 62 data_time: 0.84s time: 520.52s eta: 3 days, 11:34:23
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.286 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 32445.0 tgs: 62 data_time: 0.76s time: 518.89s eta: 3 days, 11:09:59
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.1GB text_tokens: 32483.0 tgs: 62 data_time: 0.85s time: 522.09s eta: 3 days, 11:32:02
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.298 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.8GB text_tokens: 30947.0 tgs: 59 data_time: 0.87s time: 523.81s eta: 3 days, 11:39:53
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 33.0GB text_tokens: 32390.0 tgs: 62 data_time: 0.75s time: 520.47s eta: 3 days, 10:59:09
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 32.9GB text_tokens: 31866.0 tgs: 61 data_time: 0.85s time: 518.42s eta: 3 days, 10:30:52
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 32.9GB text_tokens: 31128.0 tgs: 59 data_time: 0.81s time: 522.82s eta: 3 days, 11:04:14
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.0GB text_tokens: 32297.0 tgs: 61 data_time: 0.67s time: 523.52s eta: 3 days, 11:02:11
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.305 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 32495.0 tgs: 62 data_time: 1.01s time: 521.01s eta: 3 days, 10:29:36
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.9GB text_tokens: 31533.0 tgs: 60 data_time: 0.70s time: 519.08s eta: 3 days, 10:02:38
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 32.4GB text_tokens: 31074.0 tgs: 59 data_time: 0.73s time: 523.28s eta: 3 days, 10:33:41
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.332 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 32081.0 tgs: 61 data_time: 0.82s time: 522.97s eta: 3 days, 10:22:02
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 31457.0 tgs: 60 data_time: 0.78s time: 521.30s eta: 3 days, 9:57:35
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.235 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.9GB text_tokens: 31099.0 tgs: 59 data_time: 0.69s time: 520.62s eta: 3 days, 9:42:30
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.306 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 33.0GB text_tokens: 32202.0 tgs: 61 data_time: 0.73s time: 522.42s eta: 3 days, 9:50:44
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.327 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 32.7GB text_tokens: 30996.0 tgs: 59 data_time: 0.69s time: 523.77s eta: 3 days, 9:54:43
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.0GB text_tokens: 31401.0 tgs: 60 data_time: 0.93s time: 520.41s eta: 3 days, 9:14:30
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 32.7GB text_tokens: 31506.0 tgs: 60 data_time: 0.83s time: 520.66s eta: 3 days, 9:08:10
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.0GB text_tokens: 31849.0 tgs: 61 data_time: 0.79s time: 521.32s eta: 3 days, 9:05:39
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 33.0GB text_tokens: 31503.0 tgs: 60 data_time: 0.63s time: 524.09s eta: 3 days, 9:22:48
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.326 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 32241.0 tgs: 61 data_time: 0.75s time: 520.94s eta: 3 days, 8:44:46
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.325 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.0GB text_tokens: 31502.0 tgs: 60 data_time: 0.83s time: 520.21s eta: 3 days, 8:29:18
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 32.8GB text_tokens: 29886.0 tgs: 57 data_time: 0.92s time: 522.55s eta: 3 days, 8:42:18
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 33.1GB text_tokens: 32309.0 tgs: 61 data_time: 0.74s time: 524.32s eta: 3 days, 8:49:55
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 32.8GB text_tokens: 31388.0 tgs: 60 data_time: 0.84s time: 519.87s eta: 3 days, 8:00:07
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.362 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 31748.0 tgs: 61 data_time: 0.78s time: 520.12s eta: 3 days, 7:53:45
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 33.1GB text_tokens: 31342.0 tgs: 60 data_time: 0.67s time: 522.00s eta: 3 days, 8:02:23
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 33.0GB text_tokens: 31840.0 tgs: 60 data_time: 0.84s time: 524.34s eta: 3 days, 8:15:09
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 31789.0 tgs: 61 data_time: 0.64s time: 520.56s eta: 3 days, 7:31:46
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 33.0GB text_tokens: 31941.0 tgs: 61 data_time: 0.79s time: 518.55s eta: 3 days, 7:04:43
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 32.2GB text_tokens: 30569.0 tgs: 58 data_time: 0.73s time: 523.14s eta: 3 days, 7:37:59
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.228 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 33.1GB text_tokens: 31307.0 tgs: 59 data_time: 0.81s time: 523.61s eta: 3 days, 7:33:33
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.6GB text_tokens: 31568.0 tgs: 60 data_time: 0.76s time: 520.20s eta: 3 days, 6:53:51
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.361 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.1GB text_tokens: 31707.0 tgs: 60 data_time: 1.05s time: 520.04s eta: 3 days, 6:43:40
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 33.1GB text_tokens: 31866.0 tgs: 60 data_time: 0.87s time: 523.74s eta: 3 days, 7:08:35
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.1GB text_tokens: 31584.0 tgs: 60 data_time: 0.88s time: 523.83s eta: 3 days, 7:00:37
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.0GB text_tokens: 31870.0 tgs: 61 data_time: 0.93s time: 520.22s eta: 3 days, 6:19:20
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.247 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.8GB text_tokens: 30551.0 tgs: 58 data_time: 0.59s time: 518.76s eta: 3 days, 5:57:29
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 33.1GB text_tokens: 31961.0 tgs: 61 data_time: 0.79s time: 522.67s eta: 3 days, 6:24:01
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.1GB text_tokens: 31763.0 tgs: 60 data_time: 0.97s time: 522.77s eta: 3 days, 6:16:11
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.277 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.1GB text_tokens: 31917.0 tgs: 61 data_time: 1.02s time: 520.51s eta: 3 days, 5:47:14
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:26:33][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:26:33][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.252 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 33.1GB text_tokens: 31804.0 tgs: 61 data_time: 0.69s time: 519.64s eta: 3 days, 5:30:47
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.256 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.1GB text_tokens: 32278.0 tgs: 61 data_time: 0.97s time: 521.34s eta: 3 days, 5:37:15
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 32.9GB text_tokens: 31927.0 tgs: 60 data_time: 0.77s time: 523.57s eta: 3 days, 5:48:31
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 31369.0 tgs: 60 data_time: 0.93s time: 519.29s eta: 3 days, 5:01:41
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.0GB text_tokens: 32214.0 tgs: 61 data_time: 0.79s time: 520.67s eta: 3 days, 5:05:15
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.261 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 33.0GB text_tokens: 31816.0 tgs: 61 data_time: 0.89s time: 521.47s eta: 3 days, 5:03:41
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 32.9GB text_tokens: 31958.0 tgs: 60 data_time: 0.60s time: 523.93s eta: 3 days, 5:16:46
|
| 382 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.323 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.1GB text_tokens: 31545.0 tgs: 60 data_time: 0.98s time: 518.84s eta: 3 days, 4:23:02
|
| 384 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.1GB text_tokens: 31565.0 tgs: 60 data_time: 0.74s time: 519.27s eta: 3 days, 4:18:15
|
| 386 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.296 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 33.0GB text_tokens: 31753.0 tgs: 60 data_time: 0.68s time: 520.70s eta: 3 days, 4:22:08
|
| 388 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.290 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 32.5GB text_tokens: 30827.0 tgs: 58 data_time: 0.71s time: 524.24s eta: 3 days, 4:44:32
|
| 390 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 32.8GB text_tokens: 31476.0 tgs: 60 data_time: 1.04s time: 520.34s eta: 3 days, 4:01:38
|
| 392 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 33.1GB text_tokens: 31389.0 tgs: 60 data_time: 0.42s time: 519.52s eta: 3 days, 3:45:47
|
| 394 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 7][DP 1][SP 3][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.220 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 32.4GB text_tokens: 30909.0 tgs: 59 data_time: 0.88s time: 522.30s eta: 3 days, 4:01:26
|
20250120235238/rank8.log
ADDED
@@ -0,0 +1,395 @@
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.13s
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:07:56][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.83 seconds, peak gpu memory 13.4G
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.83 seconds, peak gpu memory 13.4G
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 31696.0 tgs: 57 data_time: 1.91s time: 547.89s eta: 3 days, 18:14:58
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.262 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 31732.0 tgs: 60 data_time: 1.04s time: 523.24s eta: 3 days, 14:02:38
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 32529.0 tgs: 62 data_time: 0.90s time: 522.88s eta: 3 days, 13:50:21
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 32.9GB text_tokens: 31543.0 tgs: 60 data_time: 0.87s time: 520.29s eta: 3 days, 13:16:10
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.0GB text_tokens: 31681.0 tgs: 60 data_time: 0.97s time: 520.98s eta: 3 days, 13:14:18
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.244 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.4GB text_tokens: 30335.0 tgs: 58 data_time: 0.83s time: 520.90s eta: 3 days, 13:04:46
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.214 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 32.8GB text_tokens: 30779.0 tgs: 58 data_time: 0.69s time: 523.39s eta: 3 days, 13:20:27
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.436 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 31645.0 tgs: 60 data_time: 0.92s time: 520.66s eta: 3 days, 12:45:06
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.255 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 32125.0 tgs: 61 data_time: 0.68s time: 520.18s eta: 3 days, 12:31:44
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.7GB text_tokens: 31629.0 tgs: 60 data_time: 0.92s time: 520.42s eta: 3 days, 12:25:22
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.281 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 32.9GB text_tokens: 31480.0 tgs: 60 data_time: 0.93s time: 524.53s eta: 3 days, 12:56:42
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.284 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.0GB text_tokens: 32172.0 tgs: 61 data_time: 0.73s time: 520.65s eta: 3 days, 12:10:16
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.339 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 32.7GB text_tokens: 31315.0 tgs: 60 data_time: 1.22s time: 519.93s eta: 3 days, 11:54:40
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.272 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.1GB text_tokens: 32182.0 tgs: 61 data_time: 0.65s time: 521.29s eta: 3 days, 11:59:06
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.229 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 31862.0 tgs: 60 data_time: 0.74s time: 524.12s eta: 3 days, 12:17:48
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.334 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.0GB text_tokens: 31743.0 tgs: 60 data_time: 0.90s time: 520.54s eta: 3 days, 11:34:30
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.216 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 32304.0 tgs: 62 data_time: 0.86s time: 518.82s eta: 3 days, 11:09:21
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.211 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31895.0 tgs: 61 data_time: 0.88s time: 522.11s eta: 3 days, 11:32:13
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.250 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.6GB text_tokens: 30695.0 tgs: 58 data_time: 0.88s time: 523.83s eta: 3 days, 11:40:02
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.231 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 32.2GB text_tokens: 30721.0 tgs: 59 data_time: 0.98s time: 520.46s eta: 3 days, 10:59:03
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.241 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 29162.0 tgs: 56 data_time: 0.60s time: 518.43s eta: 3 days, 10:31:01
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.1GB text_tokens: 31808.0 tgs: 60 data_time: 0.83s time: 522.84s eta: 3 days, 11:04:23
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.325 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 32435.0 tgs: 61 data_time: 1.02s time: 523.54s eta: 3 days, 11:02:20
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.308 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 32461.0 tgs: 62 data_time: 0.79s time: 520.97s eta: 3 days, 10:29:10
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.264 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.8GB text_tokens: 31514.0 tgs: 60 data_time: 1.00s time: 519.10s eta: 3 days, 10:02:50
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
|
| 307 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.0GB text_tokens: 30176.0 tgs: 57 data_time: 0.58s time: 523.29s eta: 3 days, 10:33:47
|
| 308 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
|
| 309 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 32145.0 tgs: 61 data_time: 1.00s time: 522.93s eta: 3 days, 10:21:42
|
| 310 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
|
| 311 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 32029.0 tgs: 61 data_time: 0.57s time: 521.32s eta: 3 days, 9:57:44
|
| 312 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
|
| 313 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.349 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.9GB text_tokens: 31383.0 tgs: 60 data_time: 0.64s time: 520.64s eta: 3 days, 9:42:39
|
| 314 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
|
| 315 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.220 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 32.4GB text_tokens: 30895.0 tgs: 59 data_time: 0.77s time: 522.41s eta: 3 days, 9:50:39
|
| 316 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
|
| 317 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.317 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.0GB text_tokens: 31598.0 tgs: 60 data_time: 1.05s time: 523.75s eta: 3 days, 9:54:32
|
| 318 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
|
| 319 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.369 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 32406.0 tgs: 62 data_time: 0.78s time: 520.43s eta: 3 days, 9:14:40
|
| 320 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
|
| 321 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.313 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 32322.0 tgs: 62 data_time: 0.84s time: 520.68s eta: 3 days, 9:08:19
|
| 322 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
|
| 323 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.361 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.1GB text_tokens: 30969.0 tgs: 59 data_time: 0.83s time: 521.30s eta: 3 days, 9:05:28
|
| 324 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
|
| 325 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.220 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.8GB text_tokens: 31334.0 tgs: 59 data_time: 0.90s time: 524.11s eta: 3 days, 9:22:56
|
| 326 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
|
| 327 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 31952.0 tgs: 61 data_time: 0.84s time: 520.96s eta: 3 days, 8:44:54
|
| 328 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
|
| 329 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.0GB text_tokens: 32127.0 tgs: 61 data_time: 0.92s time: 520.15s eta: 3 days, 8:28:43
|
| 330 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
|
| 331 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.265 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 32.9GB text_tokens: 31858.0 tgs: 60 data_time: 0.83s time: 522.56s eta: 3 days, 8:42:24
|
| 332 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
|
| 333 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.8GB text_tokens: 31386.0 tgs: 59 data_time: 0.85s time: 524.33s eta: 3 days, 8:50:03
|
| 334 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
|
| 335 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.1GB text_tokens: 32250.0 tgs: 62 data_time: 0.71s time: 519.88s eta: 3 days, 8:00:15
|
| 336 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
|
| 337 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.340 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 32507.0 tgs: 62 data_time: 1.07s time: 520.11s eta: 3 days, 7:53:41
|
| 338 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
|
| 339 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 32.9GB text_tokens: 31757.0 tgs: 60 data_time: 0.68s time: 522.00s eta: 3 days, 8:02:22
|
| 340 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 32.8GB text_tokens: 31888.0 tgs: 60 data_time: 0.98s time: 524.35s eta: 3 days, 8:15:17
|
| 342 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 31891.0 tgs: 61 data_time: 0.81s time: 520.50s eta: 3 days, 7:31:17
|
| 344 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.245 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.9GB text_tokens: 31410.0 tgs: 60 data_time: 0.68s time: 518.56s eta: 3 days, 7:04:51
|
| 346 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.238 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.0GB text_tokens: 31763.0 tgs: 60 data_time: 0.96s time: 523.15s eta: 3 days, 7:38:07
|
| 348 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.234 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 32.6GB text_tokens: 30226.0 tgs: 57 data_time: 0.90s time: 523.62s eta: 3 days, 7:33:41
|
| 350 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.316 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.9GB text_tokens: 31914.0 tgs: 61 data_time: 0.85s time: 520.18s eta: 3 days, 6:53:35
|
| 352 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.259 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.0GB text_tokens: 31209.0 tgs: 60 data_time: 0.49s time: 520.06s eta: 3 days, 6:43:50
|
| 354 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.273 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 32.9GB text_tokens: 31712.0 tgs: 60 data_time: 0.75s time: 523.76s eta: 3 days, 7:08:45
|
| 356 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
|
| 357 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.0GB text_tokens: 31541.0 tgs: 60 data_time: 0.80s time: 523.78s eta: 3 days, 7:00:11
|
| 358 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
|
| 359 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.350 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 32580.0 tgs: 62 data_time: 0.80s time: 520.24s eta: 3 days, 6:19:29
|
| 360 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
|
| 361 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.211 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.7GB text_tokens: 31793.0 tgs: 61 data_time: 0.57s time: 518.77s eta: 3 days, 5:57:36
|
| 362 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
|
| 363 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.303 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 32.8GB text_tokens: 31726.0 tgs: 60 data_time: 0.66s time: 522.69s eta: 3 days, 6:24:10
|
| 364 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.0GB text_tokens: 32291.0 tgs: 61 data_time: 0.87s time: 522.73s eta: 3 days, 6:15:51
|
| 366 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.216 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.0GB text_tokens: 31197.0 tgs: 59 data_time: 0.78s time: 520.53s eta: 3 days, 5:47:23
|
| 368 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.271 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 32.7GB text_tokens: 31650.0 tgs: 60 data_time: 0.90s time: 519.66s eta: 3 days, 5:30:55
|
| 370 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.0GB text_tokens: 31693.0 tgs: 60 data_time: 0.79s time: 521.29s eta: 3 days, 5:36:53
|
| 372 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.362 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 33.0GB text_tokens: 30935.0 tgs: 59 data_time: 0.82s time: 523.59s eta: 3 days, 5:48:40
|
| 374 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.257 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 31373.0 tgs: 60 data_time: 0.75s time: 519.32s eta: 3 days, 5:01:58
|
| 376 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.354 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 31466.0 tgs: 60 data_time: 0.73s time: 520.65s eta: 3 days, 5:05:06
|
| 378 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.231 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 32.9GB text_tokens: 31582.0 tgs: 60 data_time: 0.76s time: 521.45s eta: 3 days, 5:03:33
|
| 380 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.292 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 31379.0 tgs: 59 data_time: 0.78s time: 523.94s eta: 3 days, 5:16:53
|
| 382 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.258 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.0GB text_tokens: 31834.0 tgs: 61 data_time: 0.50s time: 518.85s eta: 3 days, 4:23:10
|
| 384 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.296 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.0GB text_tokens: 31863.0 tgs: 61 data_time: 0.65s time: 519.24s eta: 3 days, 4:17:57
|
| 386 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.278 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 32.9GB text_tokens: 31055.0 tgs: 59 data_time: 0.67s time: 520.71s eta: 3 days, 4:22:16
|
| 388 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.231 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.0GB text_tokens: 31953.0 tgs: 60 data_time: 0.76s time: 524.25s eta: 3 days, 4:44:41
|
| 390 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.225 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 32157.0 tgs: 61 data_time: 0.60s time: 520.33s eta: 3 days, 4:01:32
|
| 392 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 32.7GB text_tokens: 31301.0 tgs: 60 data_time: 0.74s time: 519.50s eta: 3 days, 3:45:34
|
| 394 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 8][DP 2][SP 0][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.337 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.0GB text_tokens: 31423.0 tgs: 60 data_time: 0.47s time: 522.32s eta: 3 days, 4:01:33
|
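Every step in the span above reports `grad_norm: nan` and gets skipped; `if_nan_skip` climbs in lockstep with the step index, so by step 70 not a single optimizer update has been applied on this run. The guard producing the WARNING lines behaves roughly like the sketch below (a minimal sketch assuming a PyTorch-style loop; the class and method names are illustrative, not XTuner's internals, and only the logged message and the run's `max_grad_norm=1` setting come from these logs):

```python
import math

import torch


class NanSkipGuard:
    """Skip the optimizer step when the global grad norm is NaN/Inf,
    mirroring the `if_nan_skip` counter in the log above."""

    def __init__(self, max_grad_norm: float = 1.0):  # max_grad_norm: 1 in the run config
        self.max_grad_norm = max_grad_norm
        self.nan_skips = 0

    def step(self, model: torch.nn.Module, optimizer: torch.optim.Optimizer) -> bool:
        # clip_grad_norm_ returns the pre-clip total norm; NaN/Inf gradients
        # propagate into it, which is what the guard tests for.
        total_norm = float(torch.nn.utils.clip_grad_norm_(
            model.parameters(), self.max_grad_norm))
        if not math.isfinite(total_norm):
            self.nan_skips += 1
            optimizer.zero_grad()  # drop the poisoned gradients instead of applying them
            print(f"The grad norm is NaN or Inf, skip this step. "
                  f"Skipped {self.nan_skips} steps in total.")
            return False
        optimizer.step()
        optimizer.zero_grad()
        return True
```

Note that the per-rank `loss` stays in a normal range (roughly 0.21 to 0.37) while `loss(reduced)` is already `nan`, which points at a blow-up on some other rank rather than on RANK 8 itself.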
20250120235238/rank9.log
ADDED
@@ -0,0 +1,395 @@
+[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-20 23:52:42][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250120235238', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
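The `Namespace(...)` dump above pins down the schedule visible in the rank 8 excerpt earlier: `lr=2e-05`, `lr_min=6e-06`, `warmup_ratio=0.025`, and a 593-step run give about 15 warmup steps, matching the lr column that climbs from 0.000013 at step 9 to 0.000020 by step 14 and then holds. A minimal sketch of such a schedule (the linear warmup follows from the logged values; the cosine decay to `lr_min` after warmup is an assumption, since the decay region is not visible in this excerpt):

```python
import math


def lr_at(step: int, total_steps: int = 593, lr: float = 2e-5,
          lr_min: float = 6e-6, warmup_ratio: float = 0.025) -> float:
    """Linear warmup to `lr`, then an (assumed) cosine decay to `lr_min`."""
    warmup_steps = math.ceil(total_steps * warmup_ratio)  # ceil(14.825) = 15
    if step < warmup_steps:
        # e.g. step 9 -> 2e-5 * 10 / 15 ~= 1.3e-5, as printed in the rank 8 log
        return lr * (step + 1) / warmup_steps
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return lr_min + 0.5 * (lr - lr_min) * (1.0 + math.cos(math.pi * progress))
```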
+[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-20 23:52:42][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
+[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-20 23:53:37][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-20 23:54:31][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-20 23:55:25][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
+[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-20 23:56:18][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
+[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-20 23:57:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
+[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-20 23:58:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-20 23:59:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
+[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:00:01][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
+[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:00:05][INFO] [Dataset & Dataloader] Cost 443.14s
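The eight WARNING lines above account for 29 discarded samples in total: any sample whose tokenized prompt exceeds `max_length=32768` is dropped before packing. A sketch of that filter (assumed shape, not XTuner's actual loader code; `tokenize` stands in for the real chat-template tokenization):

```python
import json
from typing import Callable, List


def load_filtered(path: str, tokenize: Callable[[dict], List[int]],
                  max_length: int = 32768) -> List[dict]:
    """Load one .jsonl shard, dropping over-length prompts and reporting
    the discards the way the warnings above do."""
    kept, discarded = [], 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            sample = json.loads(line)
            if len(tokenize(sample)) > max_length:
                discarded += 1  # prompt longer than the context window: drop it
                continue
            kept.append(sample)
    if discarded:
        print(f"{path} has {discarded} prompt length>{max_length}, discard.")
    return kept
```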
|
| 12 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 44 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 45 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 46 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 47 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 48 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 49 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 50 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 51 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 52 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 53 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 54 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 55 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 56 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 57 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 58 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 59 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 60 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 61 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 62 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 63 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 64 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 65 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 66 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 67 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 68 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 69 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 70 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 71 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 72 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 73 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 74 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 75 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 76 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 77 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 78 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 79 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 80 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 81 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 82 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 83 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 84 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 85 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 86 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 189 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 190 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 191 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 192 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 193 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 194 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 195 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 196 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 197 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 198 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 199 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 200 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 201 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 202 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 203 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 204 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 205 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 206 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 207 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 208 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 209 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 210 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 211 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 212 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 213 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 214 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 215 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 216 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 217 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 218 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 219 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 220 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 221 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 222 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 245 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 246 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:07:55][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
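The DEBUG lines above show each Qwen2 submodule's `forward` being rebound to an optimized handler (`rms_norm_forward`, `qwen2_attn_flash_forward`). A minimal sketch of that dispatch pattern, assuming a simple class-name-to-handler mapping (XTuner's actual matching logic may differ):

```python
import types

def dispatch_forwards(model, dispatch_map, log=print):
    """Rebind `forward` on every submodule whose class name has a handler."""
    for name, module in model.named_modules():
        handler = dispatch_map.get(type(module).__name__)
        if handler is not None:
            module.forward = types.MethodType(handler, module)
            log(f"Dispatch {name}({type(module).__name__}) "
                f"forward to `{handler.__name__}`")

# Hypothetical usage with the handlers named in this log:
# dispatch_forwards(model, {
#     "Qwen2RMSNorm": rms_norm_forward,
#     "Qwen2FlashAttention2": qwen2_attn_flash_forward,
# })
```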
| 254 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:10:23][SUCCESS] [Parallelize LLM] Elapsed time 147.88 seconds, peak gpu memory 13.4G
|
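The SUCCESS line reports wall-clock time and peak GPU memory for the parallelization phase. A sketch of how such a summary can be produced with standard PyTorch CUDA statistics (the exact measurement points are an assumption):

```python
import time
import torch

torch.cuda.reset_peak_memory_stats()
start = time.time()
# ... shard / parallelize the model across ranks here ...
elapsed = time.time() - start
peak_gb = torch.cuda.max_memory_allocated() / 1024 ** 3
print(f"[Parallelize LLM] Elapsed time {elapsed:.2f} seconds, "
      f"peak gpu memory {peak_gb:.1f}G")
```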
| 255 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:10:24][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
|
| 256 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:19:46][WARNING] [Step 0] The grad norm is NaN or Inf, skip this step. Skipped 1 steps in total.
|
| 257 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:19:46][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 1 max_memory: 33.1GB text_tokens: 31696.0 tgs: 57 data_time: 1.90s time: 547.75s eta: 3 days, 18:13:35
|
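Every step in this run trips the NaN/Inf grad-norm guard: the gradient norm is computed during clipping and, when it is not finite, the optimizer step is skipped and a running skip counter is logged. A sketch of that guard, with illustrative names rather than XTuner's internals:

```python
import torch

nan_skip_total = 0  # running count, as in "Skipped N steps in total"

def guarded_step(model, optimizer, step, max_grad_norm=1.0):
    global nan_skip_total
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if not torch.isfinite(grad_norm):
        nan_skip_total += 1
        print(f"[Step {step}] The grad norm is NaN or Inf, skip this step. "
              f"Skipped {nan_skip_total} steps in total.")
    else:
        optimizer.step()
    optimizer.zero_grad()
    return grad_norm
```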
| 258 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:28:29][WARNING] [Step 1] The grad norm is NaN or Inf, skip this step. Skipped 2 steps in total.
|
| 259 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:28:29][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.285 loss(reduced): nan grad_norm: nan if_nan_skip: 2 max_memory: 33.1GB text_tokens: 31732.0 tgs: 60 data_time: 1.06s time: 523.21s eta: 3 days, 14:02:20
|
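The INFO lines also carry throughput and scheduling fields: `tgs` (text tokens per GPU per second for the step) and `eta` (remaining steps times step time). A sketch of plausible formulas, which approximately reproduce the logged values (the real implementation likely smooths step time):

```python
import datetime

def step_metrics(text_tokens, step_time, cur_step, total_steps):
    tgs = int(text_tokens / step_time)  # tokens per GPU per second
    eta = datetime.timedelta(seconds=int((total_steps - cur_step) * step_time))
    return tgs, eta

# Step 1 above: step_metrics(31696, 547.75, 1, 593)
# -> (57, datetime.timedelta(days=3, seconds=65068)), i.e. ~3 days 18h.
```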
| 260 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:37:12][WARNING] [Step 2] The grad norm is NaN or Inf, skip this step. Skipped 3 steps in total.
|
| 261 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:37:12][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.217 loss(reduced): nan grad_norm: nan if_nan_skip: 3 max_memory: 33.1GB text_tokens: 32529.0 tgs: 62 data_time: 0.89s time: 522.88s eta: 3 days, 13:50:21
|
| 262 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:45:52][WARNING] [Step 3] The grad norm is NaN or Inf, skip this step. Skipped 4 steps in total.
|
| 263 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:45:52][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.302 loss(reduced): nan grad_norm: nan if_nan_skip: 4 max_memory: 32.9GB text_tokens: 31543.0 tgs: 60 data_time: 0.86s time: 520.29s eta: 3 days, 13:16:11
|
| 264 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:54:33][WARNING] [Step 4] The grad norm is NaN or Inf, skip this step. Skipped 5 steps in total.
|
| 265 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 00:54:33][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.304 loss(reduced): nan grad_norm: nan if_nan_skip: 5 max_memory: 33.0GB text_tokens: 31681.0 tgs: 60 data_time: 0.94s time: 520.98s eta: 3 days, 13:14:18
|
| 266 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:03:14][WARNING] [Step 5] The grad norm is NaN or Inf, skip this step. Skipped 6 steps in total.
|
| 267 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:03:14][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 6 max_memory: 32.4GB text_tokens: 30335.0 tgs: 58 data_time: 0.82s time: 520.89s eta: 3 days, 13:04:45
|
| 268 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:11:58][WARNING] [Step 6] The grad norm is NaN or Inf, skip this step. Skipped 7 steps in total.
|
| 269 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:11:58][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.198 loss(reduced): nan grad_norm: nan if_nan_skip: 7 max_memory: 32.8GB text_tokens: 30779.0 tgs: 58 data_time: 0.69s time: 523.38s eta: 3 days, 13:20:25
|
| 270 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:20:38][WARNING] [Step 7] The grad norm is NaN or Inf, skip this step. Skipped 8 steps in total.
|
| 271 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:20:38][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.231 loss(reduced): nan grad_norm: nan if_nan_skip: 8 max_memory: 33.1GB text_tokens: 31645.0 tgs: 60 data_time: 0.91s time: 520.66s eta: 3 days, 12:45:07
|
| 272 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:29:18][WARNING] [Step 8] The grad norm is NaN or Inf, skip this step. Skipped 9 steps in total.
|
| 273 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:29:18][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 9 max_memory: 33.1GB text_tokens: 32125.0 tgs: 61 data_time: 0.67s time: 520.18s eta: 3 days, 12:31:44
|
| 274 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:37:59][WARNING] [Step 9] The grad norm is NaN or Inf, skip this step. Skipped 10 steps in total.
|
| 275 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:37:59][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.298 loss(reduced): nan grad_norm: nan if_nan_skip: 10 max_memory: 32.7GB text_tokens: 31629.0 tgs: 60 data_time: 0.91s time: 520.41s eta: 3 days, 12:25:21
|
| 276 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:46:43][WARNING] [Step 10] The grad norm is NaN or Inf, skip this step. Skipped 11 steps in total.
|
| 277 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:46:43][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.306 loss(reduced): nan grad_norm: nan if_nan_skip: 11 max_memory: 32.9GB text_tokens: 31480.0 tgs: 60 data_time: 0.97s time: 524.53s eta: 3 days, 12:56:42
|
| 278 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:55:24][WARNING] [Step 11] The grad norm is NaN or Inf, skip this step. Skipped 12 steps in total.
|
| 279 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 01:55:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.327 loss(reduced): nan grad_norm: nan if_nan_skip: 12 max_memory: 33.0GB text_tokens: 32172.0 tgs: 61 data_time: 0.71s time: 520.65s eta: 3 days, 12:10:17
|
| 280 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:04:04][WARNING] [Step 12] The grad norm is NaN or Inf, skip this step. Skipped 13 steps in total.
|
| 281 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:04:04][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.297 loss(reduced): nan grad_norm: nan if_nan_skip: 13 max_memory: 32.7GB text_tokens: 31315.0 tgs: 60 data_time: 1.22s time: 519.93s eta: 3 days, 11:54:39
|
| 282 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:12:45][WARNING] [Step 13] The grad norm is NaN or Inf, skip this step. Skipped 14 steps in total.
|
| 283 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:12:45][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 14 max_memory: 33.1GB text_tokens: 32182.0 tgs: 61 data_time: 0.64s time: 521.29s eta: 3 days, 11:59:06
|
| 284 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:21:29][WARNING] [Step 14] The grad norm is NaN or Inf, skip this step. Skipped 15 steps in total.
|
| 285 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:21:29][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.336 loss(reduced): nan grad_norm: nan if_nan_skip: 15 max_memory: 33.1GB text_tokens: 31862.0 tgs: 60 data_time: 0.75s time: 524.12s eta: 3 days, 12:17:47
|
| 286 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:30:10][WARNING] [Step 15] The grad norm is NaN or Inf, skip this step. Skipped 16 steps in total.
|
| 287 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:30:10][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 16 max_memory: 33.0GB text_tokens: 31743.0 tgs: 60 data_time: 0.89s time: 520.54s eta: 3 days, 11:34:31
|
| 288 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:38:49][WARNING] [Step 16] The grad norm is NaN or Inf, skip this step. Skipped 17 steps in total.
|
| 289 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:38:49][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 17 max_memory: 33.1GB text_tokens: 32304.0 tgs: 62 data_time: 0.86s time: 518.82s eta: 3 days, 11:09:21
|
| 290 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:47:31][WARNING] [Step 17] The grad norm is NaN or Inf, skip this step. Skipped 18 steps in total.
|
| 291 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:47:31][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 18 max_memory: 33.0GB text_tokens: 31895.0 tgs: 61 data_time: 0.86s time: 522.11s eta: 3 days, 11:32:14
|
| 292 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:56:15][WARNING] [Step 18] The grad norm is NaN or Inf, skip this step. Skipped 19 steps in total.
|
| 293 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 02:56:15][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 19 max_memory: 32.6GB text_tokens: 30695.0 tgs: 58 data_time: 0.87s time: 523.83s eta: 3 days, 11:39:59
|
| 294 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:04:55][WARNING] [Step 19] The grad norm is NaN or Inf, skip this step. Skipped 20 steps in total.
|
| 295 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:04:55][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 20 max_memory: 32.2GB text_tokens: 30721.0 tgs: 59 data_time: 0.97s time: 520.46s eta: 3 days, 10:59:03
|
| 296 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:13:34][WARNING] [Step 20] The grad norm is NaN or Inf, skip this step. Skipped 21 steps in total.
|
| 297 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:13:34][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.229 loss(reduced): nan grad_norm: nan if_nan_skip: 21 max_memory: 33.0GB text_tokens: 29162.0 tgs: 56 data_time: 0.59s time: 518.43s eta: 3 days, 10:31:01
|
| 298 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:22:16][WARNING] [Step 21] The grad norm is NaN or Inf, skip this step. Skipped 22 steps in total.
|
| 299 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:22:16][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.315 loss(reduced): nan grad_norm: nan if_nan_skip: 22 max_memory: 33.1GB text_tokens: 31808.0 tgs: 60 data_time: 0.80s time: 522.84s eta: 3 days, 11:04:23
|
| 300 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:31:00][WARNING] [Step 22] The grad norm is NaN or Inf, skip this step. Skipped 23 steps in total.
|
| 301 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:31:00][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 23 max_memory: 33.1GB text_tokens: 32435.0 tgs: 61 data_time: 1.01s time: 523.54s eta: 3 days, 11:02:21
|
| 302 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:39:41][WARNING] [Step 23] The grad norm is NaN or Inf, skip this step. Skipped 24 steps in total.
|
| 303 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:39:41][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.269 loss(reduced): nan grad_norm: nan if_nan_skip: 24 max_memory: 33.1GB text_tokens: 32461.0 tgs: 62 data_time: 0.79s time: 520.96s eta: 3 days, 10:29:07
|
| 304 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:48:20][WARNING] [Step 24] The grad norm is NaN or Inf, skip this step. Skipped 25 steps in total.
|
| 305 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:48:20][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.266 loss(reduced): nan grad_norm: nan if_nan_skip: 25 max_memory: 32.8GB text_tokens: 31514.0 tgs: 60 data_time: 0.98s time: 519.10s eta: 3 days, 10:02:48
|
| 306 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:57:03][WARNING] [Step 25] The grad norm is NaN or Inf, skip this step. Skipped 26 steps in total.
|
| 307 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 03:57:03][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.253 loss(reduced): nan grad_norm: nan if_nan_skip: 26 max_memory: 33.0GB text_tokens: 30176.0 tgs: 57 data_time: 0.56s time: 523.29s eta: 3 days, 10:33:49
|
| 308 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:05:46][WARNING] [Step 26] The grad norm is NaN or Inf, skip this step. Skipped 27 steps in total.
|
| 309 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:05:46][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 27 max_memory: 33.1GB text_tokens: 32145.0 tgs: 61 data_time: 0.99s time: 522.93s eta: 3 days, 10:21:42
|
| 310 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:14:28][WARNING] [Step 27] The grad norm is NaN or Inf, skip this step. Skipped 28 steps in total.
|
| 311 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:14:28][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 28 max_memory: 33.1GB text_tokens: 32029.0 tgs: 61 data_time: 0.58s time: 521.31s eta: 3 days, 9:57:43
|
| 312 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:23:08][WARNING] [Step 28] The grad norm is NaN or Inf, skip this step. Skipped 29 steps in total.
|
| 313 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:23:08][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 29 max_memory: 32.9GB text_tokens: 31383.0 tgs: 60 data_time: 0.64s time: 520.64s eta: 3 days, 9:42:39
|
| 314 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:31:51][WARNING] [Step 29] The grad norm is NaN or Inf, skip this step. Skipped 30 steps in total.
|
| 315 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:31:51][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.295 loss(reduced): nan grad_norm: nan if_nan_skip: 30 max_memory: 32.4GB text_tokens: 30895.0 tgs: 59 data_time: 0.77s time: 522.41s eta: 3 days, 9:50:38
|
| 316 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:40:34][WARNING] [Step 30] The grad norm is NaN or Inf, skip this step. Skipped 31 steps in total.
|
| 317 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:40:34][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.294 loss(reduced): nan grad_norm: nan if_nan_skip: 31 max_memory: 33.0GB text_tokens: 31598.0 tgs: 60 data_time: 1.05s time: 523.75s eta: 3 days, 9:54:31
|
| 318 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:49:15][WARNING] [Step 31] The grad norm is NaN or Inf, skip this step. Skipped 32 steps in total.
|
| 319 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:49:15][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.239 loss(reduced): nan grad_norm: nan if_nan_skip: 32 max_memory: 33.1GB text_tokens: 32406.0 tgs: 62 data_time: 0.78s time: 520.43s eta: 3 days, 9:14:39
|
| 320 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:57:55][WARNING] [Step 32] The grad norm is NaN or Inf, skip this step. Skipped 33 steps in total.
|
| 321 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 04:57:55][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 33 max_memory: 33.0GB text_tokens: 32322.0 tgs: 62 data_time: 0.83s time: 520.68s eta: 3 days, 9:08:18
|
| 322 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:06:37][WARNING] [Step 33] The grad norm is NaN or Inf, skip this step. Skipped 34 steps in total.
|
| 323 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:06:37][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 34 max_memory: 33.1GB text_tokens: 30969.0 tgs: 59 data_time: 0.86s time: 521.30s eta: 3 days, 9:05:27
|
| 324 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:15:21][WARNING] [Step 34] The grad norm is NaN or Inf, skip this step. Skipped 35 steps in total.
|
| 325 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:15:21][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 35 max_memory: 32.8GB text_tokens: 31334.0 tgs: 59 data_time: 0.88s time: 524.11s eta: 3 days, 9:22:57
|
| 326 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:24:02][WARNING] [Step 35] The grad norm is NaN or Inf, skip this step. Skipped 36 steps in total.
|
| 327 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:24:02][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.226 loss(reduced): nan grad_norm: nan if_nan_skip: 36 max_memory: 33.0GB text_tokens: 31952.0 tgs: 61 data_time: 0.82s time: 520.96s eta: 3 days, 8:44:54
|
| 328 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:32:42][WARNING] [Step 36] The grad norm is NaN or Inf, skip this step. Skipped 37 steps in total.
|
| 329 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:32:42][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.224 loss(reduced): nan grad_norm: nan if_nan_skip: 37 max_memory: 33.0GB text_tokens: 32127.0 tgs: 61 data_time: 0.90s time: 520.15s eta: 3 days, 8:28:42
|
| 330 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:41:25][WARNING] [Step 37] The grad norm is NaN or Inf, skip this step. Skipped 38 steps in total.
|
| 331 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:41:25][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.227 loss(reduced): nan grad_norm: nan if_nan_skip: 38 max_memory: 32.9GB text_tokens: 31858.0 tgs: 60 data_time: 0.83s time: 522.56s eta: 3 days, 8:42:24
|
| 332 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:50:09][WARNING] [Step 38] The grad norm is NaN or Inf, skip this step. Skipped 39 steps in total.
|
| 333 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:50:09][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.275 loss(reduced): nan grad_norm: nan if_nan_skip: 39 max_memory: 32.8GB text_tokens: 31386.0 tgs: 59 data_time: 0.84s time: 524.33s eta: 3 days, 8:50:03
|
| 334 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:58:49][WARNING] [Step 39] The grad norm is NaN or Inf, skip this step. Skipped 40 steps in total.
|
| 335 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 05:58:49][INFO] [Train] (Epoch 1) Step 40/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 40 max_memory: 33.1GB text_tokens: 32250.0 tgs: 62 data_time: 0.71s time: 519.88s eta: 3 days, 8:00:15
|
| 336 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:07:29][WARNING] [Step 40] The grad norm is NaN or Inf, skip this step. Skipped 41 steps in total.
|
| 337 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:07:29][INFO] [Train] (Epoch 1) Step 41/593 lr: 0.000020 loss: 0.307 loss(reduced): nan grad_norm: nan if_nan_skip: 41 max_memory: 33.1GB text_tokens: 32507.0 tgs: 62 data_time: 1.06s time: 520.11s eta: 3 days, 7:53:41
|
| 338 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:16:11][WARNING] [Step 41] The grad norm is NaN or Inf, skip this step. Skipped 42 steps in total.
|
| 339 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:16:11][INFO] [Train] (Epoch 1) Step 42/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 42 max_memory: 32.9GB text_tokens: 31757.0 tgs: 60 data_time: 0.68s time: 522.00s eta: 3 days, 8:02:21
|
| 340 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:24:55][WARNING] [Step 42] The grad norm is NaN or Inf, skip this step. Skipped 43 steps in total.
|
| 341 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:24:55][INFO] [Train] (Epoch 1) Step 43/593 lr: 0.000020 loss: 0.312 loss(reduced): nan grad_norm: nan if_nan_skip: 43 max_memory: 32.8GB text_tokens: 31888.0 tgs: 60 data_time: 0.97s time: 524.35s eta: 3 days, 8:15:17
|
| 342 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:33:36][WARNING] [Step 43] The grad norm is NaN or Inf, skip this step. Skipped 44 steps in total.
|
| 343 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:33:36][INFO] [Train] (Epoch 1) Step 44/593 lr: 0.000020 loss: 0.209 loss(reduced): nan grad_norm: nan if_nan_skip: 44 max_memory: 33.1GB text_tokens: 31891.0 tgs: 61 data_time: 0.79s time: 520.50s eta: 3 days, 7:31:17
|
| 344 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:42:14][WARNING] [Step 44] The grad norm is NaN or Inf, skip this step. Skipped 45 steps in total.
|
| 345 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:42:14][INFO] [Train] (Epoch 1) Step 45/593 lr: 0.000020 loss: 0.268 loss(reduced): nan grad_norm: nan if_nan_skip: 45 max_memory: 32.9GB text_tokens: 31410.0 tgs: 60 data_time: 0.68s time: 518.56s eta: 3 days, 7:04:51
|
| 346 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:50:57][WARNING] [Step 45] The grad norm is NaN or Inf, skip this step. Skipped 46 steps in total.
|
| 347 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:50:57][INFO] [Train] (Epoch 1) Step 46/593 lr: 0.000020 loss: 0.249 loss(reduced): nan grad_norm: nan if_nan_skip: 46 max_memory: 33.0GB text_tokens: 31763.0 tgs: 60 data_time: 0.95s time: 523.15s eta: 3 days, 7:38:07
|
| 348 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:59:41][WARNING] [Step 46] The grad norm is NaN or Inf, skip this step. Skipped 47 steps in total.
|
| 349 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 06:59:41][INFO] [Train] (Epoch 1) Step 47/593 lr: 0.000020 loss: 0.251 loss(reduced): nan grad_norm: nan if_nan_skip: 47 max_memory: 32.6GB text_tokens: 30226.0 tgs: 57 data_time: 0.90s time: 523.62s eta: 3 days, 7:33:40
|
| 350 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 07:08:21][WARNING] [Step 47] The grad norm is NaN or Inf, skip this step. Skipped 48 steps in total.
|
| 351 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 07:08:21][INFO] [Train] (Epoch 1) Step 48/593 lr: 0.000020 loss: 0.254 loss(reduced): nan grad_norm: nan if_nan_skip: 48 max_memory: 32.9GB text_tokens: 31914.0 tgs: 61 data_time: 0.85s time: 520.17s eta: 3 days, 6:53:34
|
| 352 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 07:17:01][WARNING] [Step 48] The grad norm is NaN or Inf, skip this step. Skipped 49 steps in total.
|
| 353 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 07:17:01][INFO] [Train] (Epoch 1) Step 49/593 lr: 0.000020 loss: 0.282 loss(reduced): nan grad_norm: nan if_nan_skip: 49 max_memory: 33.0GB text_tokens: 31209.0 tgs: 60 data_time: 0.48s time: 520.06s eta: 3 days, 6:43:50
|
| 354 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 07:25:45][WARNING] [Step 49] The grad norm is NaN or Inf, skip this step. Skipped 50 steps in total.
|
| 355 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 07:25:45][INFO] [Train] (Epoch 1) Step 50/593 lr: 0.000020 loss: 0.301 loss(reduced): nan grad_norm: nan if_nan_skip: 50 max_memory: 32.9GB text_tokens: 31712.0 tgs: 60 data_time: 0.75s time: 523.75s eta: 3 days, 7:08:42
|
| 356 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 07:34:29][WARNING] [Step 50] The grad norm is NaN or Inf, skip this step. Skipped 51 steps in total.
|
| 357 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 07:34:29][INFO] [Train] (Epoch 1) Step 51/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 51 max_memory: 33.0GB text_tokens: 31541.0 tgs: 60 data_time: 0.79s time: 523.78s eta: 3 days, 7:00:12
|
| 358 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 07:43:09][WARNING] [Step 51] The grad norm is NaN or Inf, skip this step. Skipped 52 steps in total.
|
| 359 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 07:43:09][INFO] [Train] (Epoch 1) Step 52/593 lr: 0.000020 loss: 0.263 loss(reduced): nan grad_norm: nan if_nan_skip: 52 max_memory: 33.1GB text_tokens: 32580.0 tgs: 62 data_time: 0.80s time: 520.24s eta: 3 days, 6:19:29
|
| 360 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 07:51:48][WARNING] [Step 52] The grad norm is NaN or Inf, skip this step. Skipped 53 steps in total.
|
| 361 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 07:51:48][INFO] [Train] (Epoch 1) Step 53/593 lr: 0.000020 loss: 0.283 loss(reduced): nan grad_norm: nan if_nan_skip: 53 max_memory: 32.7GB text_tokens: 31793.0 tgs: 61 data_time: 0.57s time: 518.78s eta: 3 days, 5:57:38
|
| 362 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:00:31][WARNING] [Step 53] The grad norm is NaN or Inf, skip this step. Skipped 54 steps in total.
|
| 363 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:00:31][INFO] [Train] (Epoch 1) Step 54/593 lr: 0.000020 loss: 0.276 loss(reduced): nan grad_norm: nan if_nan_skip: 54 max_memory: 32.8GB text_tokens: 31726.0 tgs: 60 data_time: 0.65s time: 522.68s eta: 3 days, 6:24:08
|
| 364 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:09:13][WARNING] [Step 54] The grad norm is NaN or Inf, skip this step. Skipped 55 steps in total.
|
| 365 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:09:13][INFO] [Train] (Epoch 1) Step 55/593 lr: 0.000020 loss: 0.233 loss(reduced): nan grad_norm: nan if_nan_skip: 55 max_memory: 33.0GB text_tokens: 32291.0 tgs: 61 data_time: 0.86s time: 522.73s eta: 3 days, 6:15:51
|
| 366 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:17:54][WARNING] [Step 55] The grad norm is NaN or Inf, skip this step. Skipped 56 steps in total.
|
| 367 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:17:54][INFO] [Train] (Epoch 1) Step 56/593 lr: 0.000020 loss: 0.267 loss(reduced): nan grad_norm: nan if_nan_skip: 56 max_memory: 33.0GB text_tokens: 31197.0 tgs: 59 data_time: 0.79s time: 520.52s eta: 3 days, 5:47:22
|
| 368 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:26:34][WARNING] [Step 56] The grad norm is NaN or Inf, skip this step. Skipped 57 steps in total.
|
| 369 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:26:34][INFO] [Train] (Epoch 1) Step 57/593 lr: 0.000020 loss: 0.232 loss(reduced): nan grad_norm: nan if_nan_skip: 57 max_memory: 32.7GB text_tokens: 31650.0 tgs: 60 data_time: 0.89s time: 519.66s eta: 3 days, 5:30:55
|
| 370 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:35:15][WARNING] [Step 57] The grad norm is NaN or Inf, skip this step. Skipped 58 steps in total.
|
| 371 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:35:15][INFO] [Train] (Epoch 1) Step 58/593 lr: 0.000020 loss: 0.223 loss(reduced): nan grad_norm: nan if_nan_skip: 58 max_memory: 33.0GB text_tokens: 31693.0 tgs: 60 data_time: 0.78s time: 521.29s eta: 3 days, 5:36:53
|
| 372 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:43:58][WARNING] [Step 58] The grad norm is NaN or Inf, skip this step. Skipped 59 steps in total.
|
| 373 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:43:58][INFO] [Train] (Epoch 1) Step 59/593 lr: 0.000020 loss: 0.260 loss(reduced): nan grad_norm: nan if_nan_skip: 59 max_memory: 33.0GB text_tokens: 30935.0 tgs: 59 data_time: 0.81s time: 523.59s eta: 3 days, 5:48:40
|
| 374 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:52:38][WARNING] [Step 59] The grad norm is NaN or Inf, skip this step. Skipped 60 steps in total.
|
| 375 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 08:52:38][INFO] [Train] (Epoch 1) Step 60/593 lr: 0.000020 loss: 0.287 loss(reduced): nan grad_norm: nan if_nan_skip: 60 max_memory: 32.9GB text_tokens: 31373.0 tgs: 60 data_time: 0.74s time: 519.32s eta: 3 days, 5:01:57
|
| 376 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:01:18][WARNING] [Step 60] The grad norm is NaN or Inf, skip this step. Skipped 61 steps in total.
|
| 377 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:01:18][INFO] [Train] (Epoch 1) Step 61/593 lr: 0.000020 loss: 0.279 loss(reduced): nan grad_norm: nan if_nan_skip: 61 max_memory: 33.1GB text_tokens: 31466.0 tgs: 60 data_time: 0.72s time: 520.65s eta: 3 days, 5:05:07
|
| 378 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:10:00][WARNING] [Step 61] The grad norm is NaN or Inf, skip this step. Skipped 62 steps in total.
|
| 379 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:10:00][INFO] [Train] (Epoch 1) Step 62/593 lr: 0.000020 loss: 0.243 loss(reduced): nan grad_norm: nan if_nan_skip: 62 max_memory: 32.9GB text_tokens: 31582.0 tgs: 60 data_time: 0.75s time: 521.45s eta: 3 days, 5:03:31
|
| 380 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:18:44][WARNING] [Step 62] The grad norm is NaN or Inf, skip this step. Skipped 63 steps in total.
|
| 381 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:18:44][INFO] [Train] (Epoch 1) Step 63/593 lr: 0.000020 loss: 0.289 loss(reduced): nan grad_norm: nan if_nan_skip: 63 max_memory: 33.0GB text_tokens: 31379.0 tgs: 59 data_time: 0.78s time: 523.94s eta: 3 days, 5:16:53
|
| 382 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:27:23][WARNING] [Step 63] The grad norm is NaN or Inf, skip this step. Skipped 64 steps in total.
|
| 383 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:27:23][INFO] [Train] (Epoch 1) Step 64/593 lr: 0.000020 loss: 0.274 loss(reduced): nan grad_norm: nan if_nan_skip: 64 max_memory: 33.0GB text_tokens: 31834.0 tgs: 61 data_time: 0.50s time: 518.85s eta: 3 days, 4:23:12
|
| 384 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:36:02][WARNING] [Step 64] The grad norm is NaN or Inf, skip this step. Skipped 65 steps in total.
|
| 385 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:36:02][INFO] [Train] (Epoch 1) Step 65/593 lr: 0.000020 loss: 0.240 loss(reduced): nan grad_norm: nan if_nan_skip: 65 max_memory: 33.0GB text_tokens: 31863.0 tgs: 61 data_time: 0.63s time: 519.24s eta: 3 days, 4:17:57
|
| 386 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:44:43][WARNING] [Step 65] The grad norm is NaN or Inf, skip this step. Skipped 66 steps in total.
|
| 387 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:44:43][INFO] [Train] (Epoch 1) Step 66/593 lr: 0.000020 loss: 0.280 loss(reduced): nan grad_norm: nan if_nan_skip: 66 max_memory: 32.9GB text_tokens: 31055.0 tgs: 59 data_time: 0.65s time: 520.71s eta: 3 days, 4:22:16
|
| 388 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:53:27][WARNING] [Step 66] The grad norm is NaN or Inf, skip this step. Skipped 67 steps in total.
|
| 389 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 09:53:27][INFO] [Train] (Epoch 1) Step 67/593 lr: 0.000020 loss: 0.309 loss(reduced): nan grad_norm: nan if_nan_skip: 67 max_memory: 33.0GB text_tokens: 31953.0 tgs: 60 data_time: 0.75s time: 524.25s eta: 3 days, 4:44:40
|
| 390 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 10:02:07][WARNING] [Step 67] The grad norm is NaN or Inf, skip this step. Skipped 68 steps in total.
|
| 391 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 10:02:07][INFO] [Train] (Epoch 1) Step 68/593 lr: 0.000020 loss: 0.248 loss(reduced): nan grad_norm: nan if_nan_skip: 68 max_memory: 33.1GB text_tokens: 32157.0 tgs: 61 data_time: 0.60s time: 520.33s eta: 3 days, 4:01:33
|
| 392 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 10:10:47][WARNING] [Step 68] The grad norm is NaN or Inf, skip this step. Skipped 69 steps in total.
|
| 393 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 10:10:47][INFO] [Train] (Epoch 1) Step 69/593 lr: 0.000020 loss: 0.369 loss(reduced): nan grad_norm: nan if_nan_skip: 69 max_memory: 32.7GB text_tokens: 31301.0 tgs: 60 data_time: 0.74s time: 519.49s eta: 3 days, 3:45:34
|
| 394 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 10:19:29][WARNING] [Step 69] The grad norm is NaN or Inf, skip this step. Skipped 70 steps in total.
|
| 395 |
+
[XTuner][RANK 9][DP 2][SP 1][TP 0][2025-01-21 10:19:29][INFO] [Train] (Epoch 1) Step 70/593 lr: 0.000020 loss: 0.337 loss(reduced): nan grad_norm: nan if_nan_skip: 70 max_memory: 33.0GB text_tokens: 31423.0 tgs: 60 data_time: 0.47s time: 522.32s eta: 3 days, 4:01:33
|
20250121104251/rank0.log
ADDED
|
The diff for this file is too large to render.
See raw diff
20250121104251/rank16.log
ADDED
|
@@ -0,0 +1,294 @@
|
| 1 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:42:55][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250121104251', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
|
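The Namespace dump records the full CLI configuration for this run. A partial sketch of the corresponding argument parser, reproducing a few of the logged options (flag names and defaults are assumptions inferred from the dump, including the `mirco_batch_size` spelling as logged):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--llm", required=True)
parser.add_argument("--chat-template", default="qwen2")
parser.add_argument("--sp-size", type=int, default=4)
parser.add_argument("--max-length", type=int, default=32768)
parser.add_argument("--mirco-batch-size", type=int, default=1)  # spelled as logged
parser.add_argument("--global-batch-size", type=int, default=64)
parser.add_argument("--lr", type=float, default=2e-5)
parser.add_argument("--max-grad-norm", type=float, default=1.0)

# Demo parse with a placeholder model path:
args = parser.parse_args(["--llm", "Qwen/Qwen2.5-72B-Instruct"])
```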
| 2 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:42:56][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:42:58][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:43:00][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:43:03][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
|
| 6 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:43:04][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
|
| 7 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:43:06][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
|
| 8 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:43:08][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
|
| 9 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:43:09][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
|
| 10 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:43:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
|
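Each WARNING above counts samples whose tokenized prompt exceeds `max_length=32768` and discards them, per shard file. A sketch of that filter, assuming chat-format samples tokenized with a Hugging Face chat template:

```python
def filter_by_length(samples, tokenizer, max_length=32768, path="", log=print):
    """Drop chat samples whose tokenized prompt exceeds max_length."""
    kept, dropped = [], 0
    for sample in samples:
        token_ids = tokenizer.apply_chat_template(sample["messages"])
        if len(token_ids) > max_length:
            dropped += 1
        else:
            kept.append(sample)
    if dropped:
        log(f"{path} has {dropped} prompt length>{max_length}, discard.")
    return kept
```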
| 11 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:43:18][INFO] [Dataset & Dataloader] Cost 22.87s
|
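The run is configured with `dset_pack_level='soft'` and `global_pack=True`, i.e. samples are concatenated into sequences of at most `max_length` tokens without truncating any individual sample. A greedy first-fit sketch of such soft packing (an illustrative interpretation, not XTuner's exact algorithm):

```python
def soft_pack(lengths, max_length=32768):
    """Greedy first-fit packing of sample indices into <= max_length bins."""
    bins, cur, cur_len = [], [], 0
    for idx, n in enumerate(lengths):
        if cur and cur_len + n > max_length:
            bins.append(cur)
            cur, cur_len = [], 0
        cur.append(idx)
        cur_len += n
    if cur:
        bins.append(cur)
    return bins

# e.g. soft_pack([20000, 15000, 30000], 32768) -> [[0], [1], [2]]
```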
| 12 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:37][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:46:39][SUCCESS] [Parallelize LLM] Elapsed time 121.32 seconds, peak gpu memory 13.4G
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:46:40][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 10:56:05][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.264 loss(reduced): 0.273 grad_norm: 2.54 if_nan_skip: 0 max_memory: 33.1GB text_tokens: 31743.0 tgs: 57 data_time: 1.77s time: 550.10s eta: 3 days, 18:36:50
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 11:04:55][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.253 loss(reduced): 0.257 grad_norm: 2.31 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32110.0 tgs: 60 data_time: 1.00s time: 529.49s eta: 3 days, 15:04:16
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 11:13:44][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.208 loss(reduced): 0.236 grad_norm: 1.21 if_nan_skip: 0 max_memory: 41.1GB text_tokens: 31536.0 tgs: 59 data_time: 0.87s time: 529.10s eta: 3 days, 14:51:39
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 11:22:34][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.249 loss(reduced): 0.212 grad_norm: 0.33 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31549.0 tgs: 59 data_time: 1.18s time: 529.96s eta: 3 days, 14:51:19
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 11:31:23][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.230 loss(reduced): 0.214 grad_norm: 0.25 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31125.0 tgs: 58 data_time: 0.89s time: 529.04s eta: 3 days, 14:33:22
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 11:40:13][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.289 loss(reduced): 0.221 grad_norm: 0.44 if_nan_skip: 0 max_memory: 41.2GB text_tokens: 31488.0 tgs: 59 data_time: 1.12s time: 529.87s eta: 3 days, 14:32:40
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 11:49:03][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.237 loss(reduced): 0.205 grad_norm: 0.34 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 30800.0 tgs: 58 data_time: 0.81s time: 529.64s eta: 3 days, 14:21:36
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 11:57:58][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.229 loss(reduced): 0.232 grad_norm: 0.34 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32301.0 tgs: 60 data_time: 0.99s time: 535.78s eta: 3 days, 15:12:48
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 12:06:52][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.227 loss(reduced): 0.215 grad_norm: 0.29 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31885.0 tgs: 59 data_time: 1.03s time: 534.00s eta: 3 days, 14:46:30
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 12:15:46][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.143 loss(reduced): 0.210 grad_norm: 0.23 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32246.0 tgs: 60 data_time: 0.85s time: 533.29s eta: 3 days, 14:30:43
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 12:24:35][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.163 loss(reduced): 0.204 grad_norm: 0.26 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32255.0 tgs: 60 data_time: 0.73s time: 529.43s eta: 3 days, 13:44:17
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 12:33:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.206 loss(reduced): 0.201 grad_norm: 0.22 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31976.0 tgs: 60 data_time: 0.90s time: 529.01s eta: 3 days, 13:31:25
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 12:42:18][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.182 loss(reduced): 0.206 grad_norm: 0.25 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32057.0 tgs: 60 data_time: 0.79s time: 533.81s eta: 3 days, 14:09:06
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 12:51:07][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.212 loss(reduced): 0.205 grad_norm: 0.20 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31783.0 tgs: 60 data_time: 0.85s time: 528.95s eta: 3 days, 13:13:10
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 12:59:57][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.203 loss(reduced): 0.198 grad_norm: 0.23 if_nan_skip: 0 max_memory: 41.1GB text_tokens: 30276.0 tgs: 57 data_time: 0.77s time: 530.40s eta: 3 days, 13:18:24
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 13:09:01][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.209 loss(reduced): 0.191 grad_norm: 0.22 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32240.0 tgs: 59 data_time: 0.78s time: 543.34s eta: 3 days, 15:14:10
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 13:17:57][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.179 loss(reduced): 0.205 grad_norm: 0.18 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32111.0 tgs: 59 data_time: 0.99s time: 536.35s eta: 3 days, 13:57:55
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 13:26:48][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.210 loss(reduced): 0.192 grad_norm: 0.18 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 30872.0 tgs: 58 data_time: 0.84s time: 530.96s eta: 3 days, 12:57:13
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 13:35:38][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.227 loss(reduced): 0.192 grad_norm: 0.17 if_nan_skip: 0 max_memory: 41.3GB text_tokens: 31286.0 tgs: 59 data_time: 1.05s time: 529.92s eta: 3 days, 12:38:26
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 13:44:32][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.162 loss(reduced): 0.198 grad_norm: 0.15 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31340.0 tgs: 58 data_time: 0.77s time: 534.36s eta: 3 days, 13:11:59
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 13:53:22][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.163 loss(reduced): 0.189 grad_norm: 0.16 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31187.0 tgs: 58 data_time: 0.92s time: 529.95s eta: 3 days, 12:21:00
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 14:02:19][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.162 loss(reduced): 0.198 grad_norm: 0.16 if_nan_skip: 0 max_memory: 41.2GB text_tokens: 31884.0 tgs: 59 data_time: 0.90s time: 537.05s eta: 3 days, 13:19:51
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 14:11:09][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.279 loss(reduced): 0.187 grad_norm: 0.12 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32216.0 tgs: 60 data_time: 0.88s time: 529.35s eta: 3 days, 11:57:41
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 14:20:04][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.202 loss(reduced): 0.190 grad_norm: 0.12 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31832.0 tgs: 59 data_time: 0.63s time: 535.74s eta: 3 days, 12:49:32
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 14:29:03][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.196 loss(reduced): 0.186 grad_norm: 0.12 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31974.0 tgs: 59 data_time: 1.01s time: 538.84s eta: 3 days, 13:09:59
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 14:37:57][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.204 loss(reduced): 0.188 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31429.0 tgs: 58 data_time: 0.81s time: 533.64s eta: 3 days, 12:11:46
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 14:46:47][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.167 loss(reduced): 0.195 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31130.0 tgs: 58 data_time: 1.02s time: 529.61s eta: 3 days, 11:24:46
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 14:55:38][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.168 loss(reduced): 0.184 grad_norm: 0.13 if_nan_skip: 0 max_memory: 41.1GB text_tokens: 30936.0 tgs: 58 data_time: 0.61s time: 531.67s eta: 3 days, 11:35:23
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 15:04:28][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.173 loss(reduced): 0.183 grad_norm: 0.17 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31633.0 tgs: 59 data_time: 0.75s time: 529.50s eta: 3 days, 11:06:07
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 15:13:17][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.128 loss(reduced): 0.187 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31970.0 tgs: 60 data_time: 1.04s time: 529.51s eta: 3 days, 10:57:26
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 15:22:07][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.201 loss(reduced): 0.188 grad_norm: 0.09 if_nan_skip: 0 max_memory: 41.3GB text_tokens: 25656.0 tgs: 48 data_time: 0.86s time: 529.83s eta: 3 days, 10:51:33
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 15:31:02][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.248 loss(reduced): 0.183 grad_norm: 0.16 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31702.0 tgs: 59 data_time: 0.95s time: 534.68s eta: 3 days, 11:28:10
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 15:39:52][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.235 loss(reduced): 0.182 grad_norm: 0.15 if_nan_skip: 0 max_memory: 41.3GB text_tokens: 31282.0 tgs: 59 data_time: 0.69s time: 529.79s eta: 3 days, 10:33:30
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 15:48:42][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.135 loss(reduced): 0.182 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31475.0 tgs: 59 data_time: 0.57s time: 530.09s eta: 3 days, 10:27:31
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 15:57:32][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.229 loss(reduced): 0.182 grad_norm: 0.10 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31004.0 tgs: 58 data_time: 0.91s time: 529.95s eta: 3 days, 10:17:20
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 16:06:25][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.223 loss(reduced): 0.182 grad_norm: 0.09 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31063.0 tgs: 58 data_time: 0.90s time: 533.29s eta: 3 days, 10:39:38
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 16:15:14][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.149 loss(reduced): 0.177 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.2GB text_tokens: 31013.0 tgs: 58 data_time: 1.03s time: 529.21s eta: 3 days, 9:52:51
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 16:24:04][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.161 loss(reduced): 0.183 grad_norm: 0.10 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32430.0 tgs: 61 data_time: 0.80s time: 529.67s eta: 3 days, 9:48:14
+[XTuner][RANK 16][DP 4][SP 0][TP 0][2025-01-21 16:32:54][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.168 loss(reduced): 0.175 grad_norm: 0.10 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32259.0 tgs: 60 data_time: 0.60s time: 530.39s eta: 3 days, 9:46:04
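The `[Train]` lines above pack one flat record per optimizer step (lr, loss, loss(reduced), grad_norm, if_nan_skip, max_memory, text_tokens, tgs, data_time, time, eta). A minimal Python sketch for pulling those fields out of a rank log, assuming the field order after `Step k/N` stays exactly as in the lines above; `parse_log` and the regex are illustrative helpers written for this dump, not part of XTuner:

import re
import sys

# Matches step lines such as:
# [XTuner][RANK 16][...][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.264 loss(reduced): 0.273 grad_norm: 2.54 ... tgs: 57 ...
STEP_RE = re.compile(
    r"\[Train\] \(Epoch (?P<epoch>\d+)\) Step (?P<step>\d+)/(?P<total>\d+) "
    r"lr: (?P<lr>[0-9.]+) loss: (?P<loss>[0-9.]+) "
    r"loss\(reduced\): (?P<loss_red>[0-9.]+) grad_norm: (?P<grad_norm>[0-9.]+)"
    r".* tgs: (?P<tgs>\d+) "
)

def parse_log(path):
    # Yield one dict per training step found in an XTuner rank log.
    with open(path) as f:
        for line in f:
            m = STEP_RE.search(line)
            if m is None:
                continue
            yield {
                "epoch": int(m["epoch"]),
                "step": int(m["step"]),
                "lr": float(m["lr"]),
                "loss_reduced": float(m["loss_red"]),
                "grad_norm": float(m["grad_norm"]),
                "tgs": int(m["tgs"]),
            }

if __name__ == "__main__":
    # Usage: python parse_rank_log.py 20250120235238/rank1.log
    for record in parse_log(sys.argv[1]):
        print(record)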
20250121104251/rank30.log
ADDED
@@ -0,0 +1,294 @@
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:42:55][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250121104251', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:42:56][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:43:00][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:43:02][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:43:04][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:43:06][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:43:07][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:43:09][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:43:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:43:13][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:43:18][INFO] [Dataset & Dataloader] Cost 22.84s
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 87 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:29][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 189 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 190 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 191 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 192 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 193 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 194 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 195 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 196 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 197 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 198 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 199 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 200 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 201 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 202 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 203 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 204 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 205 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 206 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 207 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 208 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 209 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 210 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 211 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 212 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 213 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 214 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 215 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 216 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 217 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 218 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 219 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 220 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 221 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 222 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 245 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 246 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:44:30][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 254 |
+
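Editor's note: the DEBUG lines above record XTuner re-binding each matching module's forward to an optimized implementation (FlashAttention for the attention blocks, a fused RMSNorm for every Qwen2RMSNorm). A minimal sketch of that dispatch pattern is below; it assumes a plain PyTorch module tree, and the function bodies are illustrative, not XTuner's actual internals.

    import types

    import torch

    def rms_norm_forward(self, hidden_states):
        # Standard RMSNorm: x * rsqrt(mean(x^2) + eps) * weight, computed in
        # fp32 for stability and cast back to the input dtype. Assumes the
        # module carries `weight` and `variance_epsilon` like HF's Qwen2RMSNorm.
        dtype = hidden_states.dtype
        x = hidden_states.to(torch.float32)
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * x.to(dtype)

    def dispatch_forward(model, cls_name, new_forward, logger=print):
        # Re-bind `forward` on every submodule whose class name matches,
        # mirroring the "Dispatch <name>(<cls>) forward to `...`" log lines.
        for name, mod in model.named_modules():
            if type(mod).__name__ == cls_name:
                mod.forward = types.MethodType(new_forward, mod)
                logger(f"Dispatch {name}({cls_name}) forward to `{new_forward.__name__}`")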
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:46:39][SUCCESS] [Parallelize LLM] Elapsed time 129.39 seconds, peak gpu memory 13.4G
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:46:40][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 10:56:05][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.315 loss(reduced): 0.273 grad_norm: 2.54 if_nan_skip: 0 max_memory: 33.1GB text_tokens: 32271.0 tgs: 58 data_time: 1.70s time: 551.63s eta: 3 days, 18:51:59
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 11:04:55][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.236 loss(reduced): 0.257 grad_norm: 2.31 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32224.0 tgs: 60 data_time: 0.97s time: 529.47s eta: 3 days, 15:04:03
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 11:13:44][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.270 loss(reduced): 0.236 grad_norm: 1.21 if_nan_skip: 0 max_memory: 41.2GB text_tokens: 31186.0 tgs: 58 data_time: 1.00s time: 529.10s eta: 3 days, 14:51:38
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 11:22:34][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.218 loss(reduced): 0.212 grad_norm: 0.33 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32092.0 tgs: 60 data_time: 0.89s time: 529.97s eta: 3 days, 14:51:21
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 11:31:23][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.215 loss(reduced): 0.214 grad_norm: 0.25 if_nan_skip: 0 max_memory: 41.3GB text_tokens: 31643.0 tgs: 59 data_time: 0.79s time: 529.03s eta: 3 days, 14:33:18
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 11:40:13][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.284 loss(reduced): 0.221 grad_norm: 0.44 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31263.0 tgs: 59 data_time: 0.78s time: 529.87s eta: 3 days, 14:32:41
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 11:49:03][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.158 loss(reduced): 0.205 grad_norm: 0.34 if_nan_skip: 0 max_memory: 41.2GB text_tokens: 30097.0 tgs: 56 data_time: 0.81s time: 529.64s eta: 3 days, 14:21:40
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 11:57:58][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.306 loss(reduced): 0.232 grad_norm: 0.34 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 32404.0 tgs: 60 data_time: 0.91s time: 535.78s eta: 3 days, 15:12:48
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 12:06:52][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.204 loss(reduced): 0.215 grad_norm: 0.29 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 30582.0 tgs: 57 data_time: 0.83s time: 534.00s eta: 3 days, 14:46:27
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 12:15:46][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.152 loss(reduced): 0.210 grad_norm: 0.23 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32372.0 tgs: 60 data_time: 0.80s time: 533.29s eta: 3 days, 14:30:41
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 12:24:35][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.189 loss(reduced): 0.204 grad_norm: 0.26 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32256.0 tgs: 60 data_time: 0.68s time: 529.44s eta: 3 days, 13:44:23
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 12:33:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.173 loss(reduced): 0.201 grad_norm: 0.22 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31025.0 tgs: 58 data_time: 0.63s time: 529.01s eta: 3 days, 13:31:24
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 12:42:18][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.235 loss(reduced): 0.206 grad_norm: 0.25 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31804.0 tgs: 59 data_time: 0.95s time: 533.82s eta: 3 days, 14:09:07
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 12:51:07][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.193 loss(reduced): 0.205 grad_norm: 0.20 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32389.0 tgs: 61 data_time: 0.84s time: 528.95s eta: 3 days, 13:13:08
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 12:59:57][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.246 loss(reduced): 0.198 grad_norm: 0.23 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 30946.0 tgs: 58 data_time: 0.89s time: 530.40s eta: 3 days, 13:18:23
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 13:09:01][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.211 loss(reduced): 0.191 grad_norm: 0.22 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31432.0 tgs: 57 data_time: 0.92s time: 543.34s eta: 3 days, 15:14:12
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 13:17:57][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.260 loss(reduced): 0.205 grad_norm: 0.18 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32194.0 tgs: 60 data_time: 0.75s time: 536.35s eta: 3 days, 13:57:55
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 13:26:48][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.287 loss(reduced): 0.192 grad_norm: 0.18 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31342.0 tgs: 59 data_time: 1.13s time: 530.96s eta: 3 days, 12:57:13
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 13:35:38][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.212 loss(reduced): 0.192 grad_norm: 0.17 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32054.0 tgs: 60 data_time: 0.73s time: 529.91s eta: 3 days, 12:38:19
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 13:44:32][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.216 loss(reduced): 0.198 grad_norm: 0.15 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32360.0 tgs: 60 data_time: 0.79s time: 534.36s eta: 3 days, 13:12:03
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 13:53:22][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.188 loss(reduced): 0.189 grad_norm: 0.16 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31497.0 tgs: 59 data_time: 0.65s time: 529.95s eta: 3 days, 12:21:01
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 14:02:19][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.202 loss(reduced): 0.198 grad_norm: 0.16 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32299.0 tgs: 60 data_time: 0.84s time: 537.05s eta: 3 days, 13:19:52
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 14:11:09][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.159 loss(reduced): 0.187 grad_norm: 0.12 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32054.0 tgs: 60 data_time: 0.69s time: 529.35s eta: 3 days, 11:57:38
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 14:20:04][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.157 loss(reduced): 0.190 grad_norm: 0.12 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31131.0 tgs: 58 data_time: 0.67s time: 535.75s eta: 3 days, 12:49:35
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 14:29:03][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.240 loss(reduced): 0.186 grad_norm: 0.12 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32103.0 tgs: 59 data_time: 0.80s time: 538.84s eta: 3 days, 13:10:02
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 14:37:57][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.179 loss(reduced): 0.188 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32152.0 tgs: 60 data_time: 0.70s time: 533.64s eta: 3 days, 12:11:46
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 14:46:47][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.191 loss(reduced): 0.195 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31336.0 tgs: 59 data_time: 1.00s time: 529.61s eta: 3 days, 11:24:47
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 14:55:38][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.173 loss(reduced): 0.184 grad_norm: 0.13 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31412.0 tgs: 59 data_time: 0.74s time: 531.67s eta: 3 days, 11:35:24
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 15:04:28][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.215 loss(reduced): 0.183 grad_norm: 0.17 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32135.0 tgs: 60 data_time: 0.74s time: 529.50s eta: 3 days, 11:06:05
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 15:13:17][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.237 loss(reduced): 0.187 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31733.0 tgs: 59 data_time: 0.84s time: 529.51s eta: 3 days, 10:57:25
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 15:22:07][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.129 loss(reduced): 0.188 grad_norm: 0.09 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31765.0 tgs: 59 data_time: 0.91s time: 529.83s eta: 3 days, 10:51:35
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 15:31:02][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.221 loss(reduced): 0.183 grad_norm: 0.16 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31949.0 tgs: 59 data_time: 0.95s time: 534.67s eta: 3 days, 11:28:06
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 15:39:52][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.179 loss(reduced): 0.182 grad_norm: 0.15 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31335.0 tgs: 59 data_time: 0.63s time: 529.79s eta: 3 days, 10:33:32
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 15:48:42][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.211 loss(reduced): 0.182 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31518.0 tgs: 59 data_time: 0.98s time: 530.10s eta: 3 days, 10:27:36
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 15:57:32][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.184 loss(reduced): 0.182 grad_norm: 0.10 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31987.0 tgs: 60 data_time: 0.95s time: 529.94s eta: 3 days, 10:17:18
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 16:06:25][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.174 loss(reduced): 0.182 grad_norm: 0.09 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32039.0 tgs: 60 data_time: 1.25s time: 533.30s eta: 3 days, 10:39:39
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 16:15:14][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.152 loss(reduced): 0.177 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32103.0 tgs: 60 data_time: 0.78s time: 529.21s eta: 3 days, 9:52:52
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 16:24:04][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.164 loss(reduced): 0.183 grad_norm: 0.10 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32352.0 tgs: 61 data_time: 0.74s time: 529.66s eta: 3 days, 9:48:12
+[XTuner][RANK 30][DP 7][SP 2][TP 0][2025-01-21 16:32:54][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.211 loss(reduced): 0.175 grad_norm: 0.10 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31385.0 tgs: 59 data_time: 1.05s time: 530.38s eta: 3 days, 9:46:02
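Editor's note: each step line carries enough to reproduce the derived metrics. Taking tgs as tokens per GPU per second and eta as step time times remaining steps (an assumption about XTuner's definitions), Step 1 gives 32271 / 551.63 ≈ 58 tgs, and 551.63 s × 592 remaining steps ≈ 3 days 18:42, close to the logged 3 days 18:51:59, which presumably uses a smoothed step time. A quick check:

    from datetime import timedelta

    def step_metrics(text_tokens, step_time, step, total_steps):
        tgs = int(text_tokens / step_time)                   # tokens per GPU per second
        eta = timedelta(seconds=int(step_time * (total_steps - step)))
        return tgs, eta

    # Step 1/593 above: 32271 tokens in 551.63 s
    print(step_metrics(32271.0, 551.63, 1, 593))
    # -> (58, datetime.timedelta(days=3, seconds=67364))  i.e. ~3 days, 18:42:44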
20250121104251/rank45.log
ADDED
@@ -0,0 +1,294 @@
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:42:55][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250121104251', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
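Editor's note: the batch arithmetic implied by these arguments (an assumption about XTuner's semantics, but consistent with the rank tags in these logs, which run up to the low 60s and so suggest a 64-GPU job): with sp_size=4, the 64 ranks form 16 data-parallel groups, and global_batch_size=64 with mirco_batch_size=1 then implies 4 gradient-accumulation micro-steps per optimizer update.

    world_size = 64                             # assumed from the rank range in these logs
    sp_size = 4                                 # from sp_size above
    dp_size = world_size // sp_size             # 16 data-parallel groups
    micro_batch, global_batch = 1, 64           # mirco_batch_size / global_batch_size above
    grad_accum = global_batch // (dp_size * micro_batch)
    print(dp_size, grad_accum)                  # -> 16 4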
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:42:56][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:42:59][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:43:02][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:43:04][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:43:06][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:43:08][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:43:10][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:43:12][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:43:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:43:18][INFO] [Dataset & Dataloader] Cost 22.82s
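Editor's note: the WARNING lines show the loader dropping any sample whose tokenized prompt exceeds max_length=32768 before packing. A minimal sketch of that kind of filtering pass, assuming an OpenAI-format JSONL and a Hugging Face tokenizer (the function below is illustrative, not XTuner's actual loader):

    import json

    def filter_jsonl(path, tokenizer, max_length=32768, log=print):
        kept, dropped = [], 0
        with open(path) as f:
            for line in f:
                sample = json.loads(line)   # OpenAI format: {"messages": [{"role": ..., "content": ...}, ...]}
                n_tokens = len(tokenizer.apply_chat_template(sample["messages"]))
                if n_tokens > max_length:
                    dropped += 1            # too long to fit one 32k context window
                    continue
                kept.append(sample)
        if dropped:
            log(f"{path} has {dropped} prompt length>{max_length}, discard.")
        return kept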
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 88 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 89 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 90 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 91 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 92 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 93 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 94 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 95 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 96 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 97 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 98 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 99 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 100 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 101 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 102 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 103 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 104 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 105 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 106 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 107 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 108 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 109 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 110 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 111 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 112 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 113 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 114 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 115 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 116 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 117 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 118 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 119 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 120 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 121 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 122 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 123 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 124 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 125 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 126 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 127 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 128 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 129 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 130 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 131 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 132 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 133 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 134 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 135 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 136 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 137 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 138 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 139 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 140 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 141 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 142 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 143 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 144 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 145 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 146 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 147 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 148 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 149 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 150 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 151 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 152 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 153 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 154 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 155 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 156 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 157 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 158 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 159 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 160 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 161 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 162 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 163 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 164 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 165 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 166 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 167 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 168 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 169 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 170 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 171 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 172 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 173 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 174 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 175 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 176 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 177 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 178 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 179 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 180 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 181 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 182 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 183 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 184 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 185 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 186 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 187 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 188 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 189 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 190 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 191 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 192 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 193 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 194 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 195 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 196 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 197 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 198 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 199 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 200 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 201 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 202 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 203 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 204 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 205 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 206 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 207 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 208 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 209 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 210 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 211 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 212 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 213 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 214 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 215 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 216 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 217 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 218 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 219 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 220 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 221 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 222 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 223 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 224 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 225 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 226 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 227 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 228 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 229 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 230 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 231 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 232 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 233 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 234 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 235 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 236 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 237 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 238 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 239 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 240 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 241 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 242 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 243 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 244 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 245 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 246 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 247 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 248 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 249 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 250 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 251 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 252 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 253 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 254 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:46:39][SUCCESS] [Parallelize LLM] Elapsed time 121.10 seconds, peak gpu memory 13.4G
|
| 255 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:46:40][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
|
| 256 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 10:56:05][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.314 loss(reduced): 0.273 grad_norm: 2.54 if_nan_skip: 0 max_memory: 33.0GB text_tokens: 31361.0 tgs: 57 data_time: 1.69s time: 549.97s eta: 3 days, 18:35:32
|
| 257 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 11:04:55][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.252 loss(reduced): 0.257 grad_norm: 2.31 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32373.0 tgs: 61 data_time: 1.13s time: 529.71s eta: 3 days, 15:06:30
|
| 258 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 11:13:44][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.257 loss(reduced): 0.236 grad_norm: 1.21 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31271.0 tgs: 59 data_time: 1.13s time: 529.09s eta: 3 days, 14:51:33
|
| 259 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 11:22:34][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.219 loss(reduced): 0.212 grad_norm: 0.33 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31959.0 tgs: 60 data_time: 0.75s time: 529.97s eta: 3 days, 14:51:19
|
| 260 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 11:31:23][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.181 loss(reduced): 0.214 grad_norm: 0.25 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32119.0 tgs: 60 data_time: 0.59s time: 529.02s eta: 3 days, 14:33:15
|
| 261 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 11:40:13][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.202 loss(reduced): 0.221 grad_norm: 0.44 if_nan_skip: 0 max_memory: 41.2GB text_tokens: 31094.0 tgs: 58 data_time: 0.78s time: 529.86s eta: 3 days, 14:32:36
|
| 262 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 11:49:03][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.217 loss(reduced): 0.205 grad_norm: 0.34 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31860.0 tgs: 60 data_time: 1.01s time: 529.63s eta: 3 days, 14:21:30
|
| 263 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 11:57:58][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.194 loss(reduced): 0.232 grad_norm: 0.34 if_nan_skip: 0 max_memory: 41.0GB text_tokens: 31574.0 tgs: 58 data_time: 0.72s time: 535.78s eta: 3 days, 15:12:45
|
| 264 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 12:06:52][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.203 loss(reduced): 0.215 grad_norm: 0.29 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31714.0 tgs: 59 data_time: 0.72s time: 534.03s eta: 3 days, 14:46:48
|
| 265 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 12:15:46][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.287 loss(reduced): 0.210 grad_norm: 0.23 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31891.0 tgs: 59 data_time: 0.77s time: 533.28s eta: 3 days, 14:30:37
|
| 266 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 12:24:35][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.200 loss(reduced): 0.204 grad_norm: 0.26 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31327.0 tgs: 59 data_time: 0.83s time: 529.43s eta: 3 days, 13:44:16
|
| 267 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 12:33:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.297 loss(reduced): 0.201 grad_norm: 0.22 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 32020.0 tgs: 60 data_time: 0.82s time: 529.02s eta: 3 days, 13:31:31
|
| 268 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 12:42:18][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.233 loss(reduced): 0.206 grad_norm: 0.25 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31061.0 tgs: 58 data_time: 0.64s time: 533.81s eta: 3 days, 14:09:03
|
| 269 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 12:51:07][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.182 loss(reduced): 0.205 grad_norm: 0.20 if_nan_skip: 0 max_memory: 41.3GB text_tokens: 30891.0 tgs: 58 data_time: 0.71s time: 528.94s eta: 3 days, 13:13:04
|
| 270 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 12:59:57][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.186 loss(reduced): 0.198 grad_norm: 0.23 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31341.0 tgs: 59 data_time: 0.82s time: 530.39s eta: 3 days, 13:18:17
|
| 271 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 13:09:01][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.204 loss(reduced): 0.191 grad_norm: 0.22 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31438.0 tgs: 57 data_time: 0.84s time: 543.36s eta: 3 days, 15:14:24
|
| 272 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 13:17:57][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.222 loss(reduced): 0.205 grad_norm: 0.18 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32067.0 tgs: 59 data_time: 0.64s time: 536.35s eta: 3 days, 13:57:52
|
| 273 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 13:26:48][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.214 loss(reduced): 0.192 grad_norm: 0.18 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32309.0 tgs: 60 data_time: 0.92s time: 530.95s eta: 3 days, 12:57:08
|
| 274 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 13:35:38][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.200 loss(reduced): 0.192 grad_norm: 0.17 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31293.0 tgs: 59 data_time: 0.72s time: 529.94s eta: 3 days, 12:38:33
|
| 275 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 13:44:32][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.165 loss(reduced): 0.198 grad_norm: 0.15 if_nan_skip: 0 max_memory: 41.3GB text_tokens: 31896.0 tgs: 59 data_time: 0.71s time: 534.35s eta: 3 days, 13:11:55
|
| 276 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 13:53:22][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.132 loss(reduced): 0.189 grad_norm: 0.16 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31834.0 tgs: 60 data_time: 0.63s time: 529.94s eta: 3 days, 12:20:56
|
| 277 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 14:02:19][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.139 loss(reduced): 0.198 grad_norm: 0.16 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32278.0 tgs: 60 data_time: 0.72s time: 537.06s eta: 3 days, 13:20:00
|
| 278 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 14:11:09][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.207 loss(reduced): 0.187 grad_norm: 0.12 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31172.0 tgs: 58 data_time: 0.90s time: 529.34s eta: 3 days, 11:57:34
|
| 279 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 14:20:04][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.213 loss(reduced): 0.190 grad_norm: 0.12 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32324.0 tgs: 60 data_time: 0.91s time: 535.74s eta: 3 days, 12:49:30
|
| 280 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 14:29:03][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.188 loss(reduced): 0.186 grad_norm: 0.12 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32419.0 tgs: 60 data_time: 0.86s time: 538.83s eta: 3 days, 13:09:56
|
| 281 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 14:37:57][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.158 loss(reduced): 0.188 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.2GB text_tokens: 31182.0 tgs: 58 data_time: 0.82s time: 533.66s eta: 3 days, 12:12:01
|
| 282 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 14:46:47][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.218 loss(reduced): 0.195 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31901.0 tgs: 60 data_time: 0.68s time: 529.60s eta: 3 days, 11:24:41
|
| 283 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 14:55:38][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.167 loss(reduced): 0.184 grad_norm: 0.13 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 30544.0 tgs: 57 data_time: 0.95s time: 531.66s eta: 3 days, 11:35:19
|
| 284 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 15:04:28][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.152 loss(reduced): 0.183 grad_norm: 0.17 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31830.0 tgs: 60 data_time: 0.72s time: 529.51s eta: 3 days, 11:06:14
|
| 285 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 15:13:17][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.221 loss(reduced): 0.187 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31792.0 tgs: 60 data_time: 1.11s time: 529.51s eta: 3 days, 10:57:22
|
| 286 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 15:22:07][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.215 loss(reduced): 0.188 grad_norm: 0.09 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31968.0 tgs: 60 data_time: 0.82s time: 529.83s eta: 3 days, 10:51:33
|
| 287 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 15:31:02][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.136 loss(reduced): 0.183 grad_norm: 0.16 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31133.0 tgs: 58 data_time: 0.83s time: 534.69s eta: 3 days, 11:28:17
|
| 288 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 15:39:52][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.173 loss(reduced): 0.182 grad_norm: 0.15 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 32219.0 tgs: 60 data_time: 0.82s time: 529.78s eta: 3 days, 10:33:27
|
| 289 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 15:48:42][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.202 loss(reduced): 0.182 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31570.0 tgs: 59 data_time: 0.91s time: 530.08s eta: 3 days, 10:27:27
|
| 290 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 15:57:32][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.171 loss(reduced): 0.182 grad_norm: 0.10 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32328.0 tgs: 61 data_time: 0.95s time: 529.94s eta: 3 days, 10:17:16
|
| 291 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 16:06:25][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.239 loss(reduced): 0.182 grad_norm: 0.09 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31564.0 tgs: 59 data_time: 0.94s time: 533.31s eta: 3 days, 10:39:45
|
| 292 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 16:15:14][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.172 loss(reduced): 0.177 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31631.0 tgs: 59 data_time: 0.92s time: 529.21s eta: 3 days, 9:52:48
|
| 293 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 16:24:04][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.205 loss(reduced): 0.183 grad_norm: 0.10 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31761.0 tgs: 59 data_time: 0.73s time: 529.65s eta: 3 days, 9:48:07
|
| 294 |
+
[XTuner][RANK 45][DP 11][SP 1][TP 0][2025-01-21 16:32:54][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.156 loss(reduced): 0.175 grad_norm: 0.10 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31545.0 tgs: 59 data_time: 0.83s time: 530.41s eta: 3 days, 9:46:16
|
20250121104251/rank47.log
ADDED
|
@@ -0,0 +1,294 @@
|
| 1 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:42:55][INFO] Namespace(llm='/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842', tokenizer=None, chat_template='qwen2', use_lora=False, lora_targets=None, lora_r=64, lora_alpha=16, lora_dropout=0.1, lora_bias='none', dtype='auto', selective_recompute=1.0, shard_strategy='full', cpu_offload=False, sp_size=4, datasets=['/mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2'], dset_file_types=dict_keys(['.jsonl', '.json']), dset_sources=['local'], dset_formats=['openai'], dset_sample_ratios=[1.0], dset_cache_dir='/mnt/petrelfs/caimengzhang/cached_data/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2', dset_pack_level='soft', global_pack=True, max_length=32768, num_workers=1, file_pattern=None, group_by_length=True, mirco_batch_size=1, global_batch_size=64, lr=2e-05, lr_min=6e-06, wd=0.01, max_grad_norm=1, epochs=1, warmup_ratio=0.025, config=None, work_dir='checkpoints/qwen25_72b_inst_base50v2-new-zh-en30w-combinev9-mls-chatbeta2/20250121104251', feishu_webhook=None, gc_interval=100, checkpoint_interval=200000.0, checkpoint_max_keep=1, checkpoint_drop_optimizer=True, log_interval=1, resume=False, seed=0, debug=False)
|
| 2 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:42:56][INFO] Found 8 files in /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2
|
| 3 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:42:59][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_1.jsonl has 4 prompt length>32768, discard.
|
| 4 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:43:03][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_2.jsonl has 4 prompt length>32768, discard.
|
| 5 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:43:05][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_3.jsonl has 5 prompt length>32768, discard.
|
| 6 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:43:07][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_4.jsonl has 6 prompt length>32768, discard.
|
| 7 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:43:09][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_5.jsonl has 2 prompt length>32768, discard.
|
| 8 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:43:11][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_6.jsonl has 4 prompt length>32768, discard.
|
| 9 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:43:14][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_7.jsonl has 3 prompt length>32768, discard.
|
| 10 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:43:15][WARNING] /mnt/petrelfs/caimengzhang/data/20b_data//base50v2-new-zh-en30w-combinev9-mls-chatbeta2/data_part_8.jsonl has 1 prompt length>32768, discard.
|
| 11 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:43:18][INFO] [Dataset & Dataloader] Cost 22.82s
|
| 12 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch (Qwen2ForCausalLM) forward to `qwen2_casual_forward`
|
| 13 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.0.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 14 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.0.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 15 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.0.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 16 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.1.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 17 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.1.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 18 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.1.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 19 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.2.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 20 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.2.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 21 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.2.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 22 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.3.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 23 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.3.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 24 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.3.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 25 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.4.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 26 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.4.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 27 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.4.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 28 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.5.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 29 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.5.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 30 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.5.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 31 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.6.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 32 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.6.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 33 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.6.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 34 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.7.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 35 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.7.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 36 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.7.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 37 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.8.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 38 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.8.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 39 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.8.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 40 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.9.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
|
| 41 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.9.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 42 |
+
[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.9.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
|
| 43 |
+
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.10.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.10.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.10.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.11.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.11.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.11.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.12.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.12.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.12.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.13.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.13.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.13.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.14.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.14.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.14.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.15.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.15.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.15.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.16.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.16.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.16.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.17.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.17.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.17.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.18.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.18.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.18.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.19.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.19.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.19.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.20.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.20.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.20.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.21.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.21.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.21.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.22.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.22.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.22.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.23.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.23.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.23.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.24.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.24.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.24.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.25.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.25.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.25.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.26.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.26.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.26.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.27.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.27.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.27.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.28.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.28.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.28.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.29.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.29.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.29.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.30.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.30.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.30.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.31.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.31.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.31.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.32.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.32.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.32.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.33.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.33.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.33.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.34.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.34.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.34.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.35.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.35.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.35.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.36.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.36.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.36.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.37.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.37.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.37.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.38.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.38.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.38.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.39.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.39.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.39.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.40.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.40.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.40.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.41.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.41.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.41.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.42.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.42.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.42.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.43.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.43.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.43.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.44.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.44.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.44.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.45.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.45.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.45.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.46.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.46.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.46.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.47.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.47.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.47.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.48.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.48.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.48.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.49.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.49.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.49.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.50.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.50.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.50.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.51.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.51.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.51.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.52.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.52.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.52.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.53.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.53.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.53.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.54.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.54.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.54.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.55.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.55.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.55.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.56.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.56.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.56.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.57.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.57.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.57.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.58.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.58.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.58.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.59.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.59.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.59.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.60.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.60.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.60.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.61.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.61.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.61.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.62.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.62.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.62.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.63.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.63.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.63.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.64.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.64.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.64.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.65.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.65.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.65.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.66.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.66.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.66.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.67.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.67.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.67.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.68.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.68.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.68.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.69.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.69.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.69.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.70.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.70.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.70.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.71.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.71.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.71.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.72.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.72.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.72.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.73.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.73.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.73.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.74.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.74.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.74.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.75.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.75.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.75.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.76.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.76.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.76.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.77.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.77.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.77.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.78.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.78.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.78.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.79.self_attn(Qwen2FlashAttention2) forward to `qwen2_attn_flash_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.79.input_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.layers.79.post_attention_layernorm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:44:38][DEBUG] Dispatch model.norm(Qwen2RMSNorm) forward to `rms_norm_forward`
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:46:39][SUCCESS] [Parallelize LLM] Elapsed time 121.14 seconds, peak gpu memory 13.4G
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:46:40][INFO] [Train] Begin Train Loop. The current GPU memory is 4.2GB
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 10:56:05][INFO] [Train] (Epoch 1) Step 1/593 lr: 0.000001 loss: 0.238 loss(reduced): 0.273 grad_norm: 2.54 if_nan_skip: 0 max_memory: 33.0GB text_tokens: 31361.0 tgs: 57 data_time: 1.69s time: 550.09s eta: 3 days, 18:36:41
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 11:04:55][INFO] [Train] (Epoch 1) Step 2/593 lr: 0.000003 loss: 0.265 loss(reduced): 0.257 grad_norm: 2.31 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32373.0 tgs: 61 data_time: 1.13s time: 529.58s eta: 3 days, 15:05:12
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 11:13:44][INFO] [Train] (Epoch 1) Step 3/593 lr: 0.000004 loss: 0.226 loss(reduced): 0.236 grad_norm: 1.21 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31271.0 tgs: 59 data_time: 1.15s time: 529.09s eta: 3 days, 14:51:33
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 11:22:34][INFO] [Train] (Epoch 1) Step 4/593 lr: 0.000006 loss: 0.166 loss(reduced): 0.212 grad_norm: 0.33 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31959.0 tgs: 60 data_time: 0.77s time: 529.96s eta: 3 days, 14:51:17
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 11:31:23][INFO] [Train] (Epoch 1) Step 5/593 lr: 0.000007 loss: 0.210 loss(reduced): 0.214 grad_norm: 0.25 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32119.0 tgs: 60 data_time: 0.60s time: 529.03s eta: 3 days, 14:33:16
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 11:40:13][INFO] [Train] (Epoch 1) Step 6/593 lr: 0.000009 loss: 0.181 loss(reduced): 0.221 grad_norm: 0.44 if_nan_skip: 0 max_memory: 41.2GB text_tokens: 31094.0 tgs: 58 data_time: 0.80s time: 529.86s eta: 3 days, 14:32:36
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 11:49:03][INFO] [Train] (Epoch 1) Step 7/593 lr: 0.000010 loss: 0.173 loss(reduced): 0.205 grad_norm: 0.34 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31860.0 tgs: 60 data_time: 1.03s time: 529.63s eta: 3 days, 14:21:31
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 11:57:58][INFO] [Train] (Epoch 1) Step 8/593 lr: 0.000011 loss: 0.239 loss(reduced): 0.232 grad_norm: 0.34 if_nan_skip: 0 max_memory: 41.0GB text_tokens: 31574.0 tgs: 58 data_time: 0.72s time: 535.78s eta: 3 days, 15:12:44
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 12:06:52][INFO] [Train] (Epoch 1) Step 9/593 lr: 0.000013 loss: 0.223 loss(reduced): 0.215 grad_norm: 0.29 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31714.0 tgs: 59 data_time: 0.76s time: 534.03s eta: 3 days, 14:46:48
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 12:15:46][INFO] [Train] (Epoch 1) Step 10/593 lr: 0.000014 loss: 0.208 loss(reduced): 0.210 grad_norm: 0.23 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31891.0 tgs: 59 data_time: 0.79s time: 533.28s eta: 3 days, 14:30:37
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 12:24:35][INFO] [Train] (Epoch 1) Step 11/593 lr: 0.000016 loss: 0.153 loss(reduced): 0.204 grad_norm: 0.26 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31327.0 tgs: 59 data_time: 0.84s time: 529.42s eta: 3 days, 13:44:14
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 12:33:24][INFO] [Train] (Epoch 1) Step 12/593 lr: 0.000017 loss: 0.261 loss(reduced): 0.201 grad_norm: 0.22 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 32020.0 tgs: 60 data_time: 0.84s time: 529.02s eta: 3 days, 13:31:32
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 12:42:18][INFO] [Train] (Epoch 1) Step 13/593 lr: 0.000019 loss: 0.194 loss(reduced): 0.206 grad_norm: 0.25 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31061.0 tgs: 58 data_time: 0.70s time: 533.81s eta: 3 days, 14:09:04
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 12:51:07][INFO] [Train] (Epoch 1) Step 14/593 lr: 0.000020 loss: 0.162 loss(reduced): 0.205 grad_norm: 0.20 if_nan_skip: 0 max_memory: 41.3GB text_tokens: 30891.0 tgs: 58 data_time: 0.74s time: 528.94s eta: 3 days, 13:13:05
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 12:59:57][INFO] [Train] (Epoch 1) Step 15/593 lr: 0.000020 loss: 0.170 loss(reduced): 0.198 grad_norm: 0.23 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31341.0 tgs: 59 data_time: 0.81s time: 530.39s eta: 3 days, 13:18:16
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 13:09:01][INFO] [Train] (Epoch 1) Step 16/593 lr: 0.000020 loss: 0.170 loss(reduced): 0.191 grad_norm: 0.22 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31438.0 tgs: 57 data_time: 0.87s time: 543.36s eta: 3 days, 15:14:24
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 13:17:57][INFO] [Train] (Epoch 1) Step 17/593 lr: 0.000020 loss: 0.197 loss(reduced): 0.205 grad_norm: 0.18 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32067.0 tgs: 59 data_time: 0.66s time: 536.35s eta: 3 days, 13:57:53
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 13:26:48][INFO] [Train] (Epoch 1) Step 18/593 lr: 0.000020 loss: 0.197 loss(reduced): 0.192 grad_norm: 0.18 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32309.0 tgs: 60 data_time: 0.95s time: 530.95s eta: 3 days, 12:57:06
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 13:35:38][INFO] [Train] (Epoch 1) Step 19/593 lr: 0.000020 loss: 0.208 loss(reduced): 0.192 grad_norm: 0.17 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31293.0 tgs: 59 data_time: 0.75s time: 529.94s eta: 3 days, 12:38:33
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 13:44:32][INFO] [Train] (Epoch 1) Step 20/593 lr: 0.000020 loss: 0.194 loss(reduced): 0.198 grad_norm: 0.15 if_nan_skip: 0 max_memory: 41.3GB text_tokens: 31896.0 tgs: 59 data_time: 0.73s time: 534.35s eta: 3 days, 13:11:57
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 13:53:22][INFO] [Train] (Epoch 1) Step 21/593 lr: 0.000020 loss: 0.152 loss(reduced): 0.189 grad_norm: 0.16 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31834.0 tgs: 60 data_time: 0.69s time: 529.94s eta: 3 days, 12:20:56
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 14:02:19][INFO] [Train] (Epoch 1) Step 22/593 lr: 0.000020 loss: 0.229 loss(reduced): 0.198 grad_norm: 0.16 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32278.0 tgs: 60 data_time: 0.73s time: 537.06s eta: 3 days, 13:19:59
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 14:11:09][INFO] [Train] (Epoch 1) Step 23/593 lr: 0.000020 loss: 0.186 loss(reduced): 0.187 grad_norm: 0.12 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31172.0 tgs: 58 data_time: 0.90s time: 529.34s eta: 3 days, 11:57:33
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 14:20:04][INFO] [Train] (Epoch 1) Step 24/593 lr: 0.000020 loss: 0.190 loss(reduced): 0.190 grad_norm: 0.12 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32324.0 tgs: 60 data_time: 0.93s time: 535.74s eta: 3 days, 12:49:30
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 14:29:03][INFO] [Train] (Epoch 1) Step 25/593 lr: 0.000020 loss: 0.197 loss(reduced): 0.186 grad_norm: 0.12 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 32419.0 tgs: 60 data_time: 0.89s time: 538.83s eta: 3 days, 13:09:56
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 14:37:57][INFO] [Train] (Epoch 1) Step 26/593 lr: 0.000020 loss: 0.180 loss(reduced): 0.188 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.2GB text_tokens: 31182.0 tgs: 58 data_time: 0.84s time: 533.66s eta: 3 days, 12:12:01
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 14:46:47][INFO] [Train] (Epoch 1) Step 27/593 lr: 0.000020 loss: 0.196 loss(reduced): 0.195 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31901.0 tgs: 60 data_time: 0.71s time: 529.60s eta: 3 days, 11:24:42
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 14:55:38][INFO] [Train] (Epoch 1) Step 28/593 lr: 0.000020 loss: 0.181 loss(reduced): 0.184 grad_norm: 0.13 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 30544.0 tgs: 57 data_time: 0.96s time: 531.66s eta: 3 days, 11:35:19
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 15:04:28][INFO] [Train] (Epoch 1) Step 29/593 lr: 0.000020 loss: 0.173 loss(reduced): 0.183 grad_norm: 0.17 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31830.0 tgs: 60 data_time: 0.75s time: 529.52s eta: 3 days, 11:06:16
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 15:13:17][INFO] [Train] (Epoch 1) Step 30/593 lr: 0.000020 loss: 0.157 loss(reduced): 0.187 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31792.0 tgs: 60 data_time: 1.14s time: 529.50s eta: 3 days, 10:57:20
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 15:22:07][INFO] [Train] (Epoch 1) Step 31/593 lr: 0.000020 loss: 0.161 loss(reduced): 0.188 grad_norm: 0.09 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31968.0 tgs: 60 data_time: 0.82s time: 529.82s eta: 3 days, 10:51:29
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 15:31:02][INFO] [Train] (Epoch 1) Step 32/593 lr: 0.000020 loss: 0.217 loss(reduced): 0.183 grad_norm: 0.16 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31133.0 tgs: 58 data_time: 0.82s time: 534.70s eta: 3 days, 11:28:21
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 15:39:52][INFO] [Train] (Epoch 1) Step 33/593 lr: 0.000020 loss: 0.136 loss(reduced): 0.182 grad_norm: 0.15 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 32219.0 tgs: 60 data_time: 0.82s time: 529.78s eta: 3 days, 10:33:27
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 15:48:42][INFO] [Train] (Epoch 1) Step 34/593 lr: 0.000020 loss: 0.224 loss(reduced): 0.182 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31570.0 tgs: 59 data_time: 0.92s time: 530.08s eta: 3 days, 10:27:26
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 15:57:32][INFO] [Train] (Epoch 1) Step 35/593 lr: 0.000020 loss: 0.163 loss(reduced): 0.182 grad_norm: 0.10 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 32328.0 tgs: 61 data_time: 0.99s time: 529.94s eta: 3 days, 10:17:16
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 16:06:25][INFO] [Train] (Epoch 1) Step 36/593 lr: 0.000020 loss: 0.141 loss(reduced): 0.182 grad_norm: 0.09 if_nan_skip: 0 max_memory: 41.4GB text_tokens: 31564.0 tgs: 59 data_time: 0.94s time: 533.31s eta: 3 days, 10:39:45
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 16:15:14][INFO] [Train] (Epoch 1) Step 37/593 lr: 0.000020 loss: 0.172 loss(reduced): 0.177 grad_norm: 0.11 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31631.0 tgs: 59 data_time: 0.99s time: 529.21s eta: 3 days, 9:52:47
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 16:24:04][INFO] [Train] (Epoch 1) Step 38/593 lr: 0.000020 loss: 0.163 loss(reduced): 0.183 grad_norm: 0.10 if_nan_skip: 0 max_memory: 41.6GB text_tokens: 31761.0 tgs: 59 data_time: 0.75s time: 529.66s eta: 3 days, 9:48:08
+[XTuner][RANK 47][DP 11][SP 3][TP 0][2025-01-21 16:32:54][INFO] [Train] (Epoch 1) Step 39/593 lr: 0.000020 loss: 0.201 loss(reduced): 0.175 grad_norm: 0.10 if_nan_skip: 0 max_memory: 41.5GB text_tokens: 31545.0 tgs: 59 data_time: 0.86s time: 530.41s eta: 3 days, 9:46:16
20250121165312/hf-593/added_tokens.json
ADDED
@@ -0,0 +1,24 @@
+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}
20250121165312/hf-593/config.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "_name_or_path": "/mnt/hwfile/opendatalab/panzhuoshi/huggingface/hub/models--Qwen--Qwen2.5-72B-Instruct/snapshots/d3d951150c1e5848237cd6a7ad11df4836aee842",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "attn_implementation": "flash_attention_2",
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 8192,
+  "initializer_range": 0.02,
+  "intermediate_size": 29568,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 70,
+  "model_type": "qwen2",
+  "num_attention_heads": 64,
+  "num_hidden_layers": 80,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.1",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
20250121165312/hf-593/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
20250121165312/hf-593/model.safetensors.index.json
ADDED
@@ -0,0 +1,970 @@
+{
+  "metadata": {
+    "total_size": 145412407296
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00031-of-00031.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00031.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00031.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00031.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00031.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00031.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00031.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00031.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00031.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00031.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00031.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00031.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00031.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00031.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00002-of-00031.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00002-of-00031.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00031.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00002-of-00031.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00031.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00031.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00031.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00031.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00031.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00031.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00031.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00031.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00005-of-00031.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00005-of-00031.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00005-of-00031.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00005-of-00031.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00005-of-00031.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00005-of-00031.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00005-of-00031.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00005-of-00031.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00005-of-00031.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00005-of-00031.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00006-of-00031.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00006-of-00031.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00005-of-00031.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00005-of-00031.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00005-of-00031.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00005-of-00031.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00006-of-00031.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00006-of-00031.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00006-of-00031.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00006-of-00031.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00006-of-00031.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00006-of-00031.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00006-of-00031.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00006-of-00031.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00006-of-00031.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00006-of-00031.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00006-of-00031.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00007-of-00031.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00007-of-00031.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00007-of-00031.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00007-of-00031.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00007-of-00031.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00007-of-00031.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00007-of-00031.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00007-of-00031.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00007-of-00031.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00007-of-00031.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00007-of-00031.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00007-of-00031.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00007-of-00031.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00007-of-00031.safetensors",
|
| 106 |
+
"model.layers.16.mlp.gate_proj.weight": "model-00007-of-00031.safetensors",
|
| 107 |
+
"model.layers.16.mlp.up_proj.weight": "model-00007-of-00031.safetensors",
|
| 108 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00007-of-00031.safetensors",
|
| 109 |
+
"model.layers.16.self_attn.k_proj.bias": "model-00007-of-00031.safetensors",
|
| 110 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00007-of-00031.safetensors",
|
| 111 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00007-of-00031.safetensors",
|
| 112 |
+
"model.layers.16.self_attn.q_proj.bias": "model-00007-of-00031.safetensors",
|
| 113 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00007-of-00031.safetensors",
|
| 114 |
+
"model.layers.16.self_attn.v_proj.bias": "model-00007-of-00031.safetensors",
|
| 115 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00007-of-00031.safetensors",
|
| 116 |
+
"model.layers.17.input_layernorm.weight": "model-00008-of-00031.safetensors",
|
| 117 |
+
"model.layers.17.mlp.down_proj.weight": "model-00008-of-00031.safetensors",
|
| 118 |
+
"model.layers.17.mlp.gate_proj.weight": "model-00007-of-00031.safetensors",
|
| 119 |
+
"model.layers.17.mlp.up_proj.weight": "model-00007-of-00031.safetensors",
|
| 120 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00008-of-00031.safetensors",
|
| 121 |
+
"model.layers.17.self_attn.k_proj.bias": "model-00007-of-00031.safetensors",
|
| 122 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00007-of-00031.safetensors",
|
| 123 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00007-of-00031.safetensors",
|
| 124 |
+
"model.layers.17.self_attn.q_proj.bias": "model-00007-of-00031.safetensors",
|
| 125 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00007-of-00031.safetensors",
|
| 126 |
+
"model.layers.17.self_attn.v_proj.bias": "model-00007-of-00031.safetensors",
|
| 127 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00007-of-00031.safetensors",
|
| 128 |
+
"model.layers.18.input_layernorm.weight": "model-00008-of-00031.safetensors",
|
| 129 |
+
"model.layers.18.mlp.down_proj.weight": "model-00008-of-00031.safetensors",
|
| 130 |
+
"model.layers.18.mlp.gate_proj.weight": "model-00008-of-00031.safetensors",
|
| 131 |
+
"model.layers.18.mlp.up_proj.weight": "model-00008-of-00031.safetensors",
|
| 132 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00008-of-00031.safetensors",
|
| 133 |
+
"model.layers.18.self_attn.k_proj.bias": "model-00008-of-00031.safetensors",
|
| 134 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00008-of-00031.safetensors",
|
| 135 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00008-of-00031.safetensors",
|
| 136 |
+
"model.layers.18.self_attn.q_proj.bias": "model-00008-of-00031.safetensors",
|
| 137 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00008-of-00031.safetensors",
|
| 138 |
+
"model.layers.18.self_attn.v_proj.bias": "model-00008-of-00031.safetensors",
|
| 139 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00008-of-00031.safetensors",
|
| 140 |
+
"model.layers.19.input_layernorm.weight": "model-00008-of-00031.safetensors",
|
| 141 |
+
"model.layers.19.mlp.down_proj.weight": "model-00008-of-00031.safetensors",
|
| 142 |
+
"model.layers.19.mlp.gate_proj.weight": "model-00008-of-00031.safetensors",
|
| 143 |
+
"model.layers.19.mlp.up_proj.weight": "model-00008-of-00031.safetensors",
|
| 144 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00008-of-00031.safetensors",
|
| 145 |
+
"model.layers.19.self_attn.k_proj.bias": "model-00008-of-00031.safetensors",
|
| 146 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00008-of-00031.safetensors",
|
| 147 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00008-of-00031.safetensors",
|
| 148 |
+
"model.layers.19.self_attn.q_proj.bias": "model-00008-of-00031.safetensors",
|
| 149 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00008-of-00031.safetensors",
|
| 150 |
+
"model.layers.19.self_attn.v_proj.bias": "model-00008-of-00031.safetensors",
|
| 151 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00008-of-00031.safetensors",
|
| 152 |
+
"model.layers.2.input_layernorm.weight": "model-00002-of-00031.safetensors",
|
| 153 |
+
"model.layers.2.mlp.down_proj.weight": "model-00002-of-00031.safetensors",
|
| 154 |
+
"model.layers.2.mlp.gate_proj.weight": "model-00002-of-00031.safetensors",
|
| 155 |
+
"model.layers.2.mlp.up_proj.weight": "model-00002-of-00031.safetensors",
|
| 156 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00002-of-00031.safetensors",
|
| 157 |
+
"model.layers.2.self_attn.k_proj.bias": "model-00002-of-00031.safetensors",
|
| 158 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00002-of-00031.safetensors",
|
| 159 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00002-of-00031.safetensors",
|
| 160 |
+
"model.layers.2.self_attn.q_proj.bias": "model-00002-of-00031.safetensors",
|
| 161 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00002-of-00031.safetensors",
|
| 162 |
+
"model.layers.2.self_attn.v_proj.bias": "model-00002-of-00031.safetensors",
|
| 163 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00002-of-00031.safetensors",
|
| 164 |
+
"model.layers.20.input_layernorm.weight": "model-00009-of-00031.safetensors",
|
| 165 |
+
"model.layers.20.mlp.down_proj.weight": "model-00009-of-00031.safetensors",
|
| 166 |
+
"model.layers.20.mlp.gate_proj.weight": "model-00008-of-00031.safetensors",
|
| 167 |
+
"model.layers.20.mlp.up_proj.weight": "model-00009-of-00031.safetensors",
|
| 168 |
+
"model.layers.20.post_attention_layernorm.weight": "model-00009-of-00031.safetensors",
|
| 169 |
+
"model.layers.20.self_attn.k_proj.bias": "model-00008-of-00031.safetensors",
|
| 170 |
+
"model.layers.20.self_attn.k_proj.weight": "model-00008-of-00031.safetensors",
|
| 171 |
+
"model.layers.20.self_attn.o_proj.weight": "model-00008-of-00031.safetensors",
|
| 172 |
+
"model.layers.20.self_attn.q_proj.bias": "model-00008-of-00031.safetensors",
|
| 173 |
+
"model.layers.20.self_attn.q_proj.weight": "model-00008-of-00031.safetensors",
|
| 174 |
+
"model.layers.20.self_attn.v_proj.bias": "model-00008-of-00031.safetensors",
|
| 175 |
+
"model.layers.20.self_attn.v_proj.weight": "model-00008-of-00031.safetensors",
|
| 176 |
+
"model.layers.21.input_layernorm.weight": "model-00009-of-00031.safetensors",
|
| 177 |
+
"model.layers.21.mlp.down_proj.weight": "model-00009-of-00031.safetensors",
|
| 178 |
+
"model.layers.21.mlp.gate_proj.weight": "model-00009-of-00031.safetensors",
|
| 179 |
+
"model.layers.21.mlp.up_proj.weight": "model-00009-of-00031.safetensors",
|
| 180 |
+
"model.layers.21.post_attention_layernorm.weight": "model-00009-of-00031.safetensors",
|
| 181 |
+
"model.layers.21.self_attn.k_proj.bias": "model-00009-of-00031.safetensors",
|
| 182 |
+
"model.layers.21.self_attn.k_proj.weight": "model-00009-of-00031.safetensors",
|
| 183 |
+
"model.layers.21.self_attn.o_proj.weight": "model-00009-of-00031.safetensors",
|
| 184 |
+
"model.layers.21.self_attn.q_proj.bias": "model-00009-of-00031.safetensors",
|
| 185 |
+
"model.layers.21.self_attn.q_proj.weight": "model-00009-of-00031.safetensors",
|
| 186 |
+
"model.layers.21.self_attn.v_proj.bias": "model-00009-of-00031.safetensors",
|
| 187 |
+
"model.layers.21.self_attn.v_proj.weight": "model-00009-of-00031.safetensors",
|
| 188 |
+
"model.layers.22.input_layernorm.weight": "model-00009-of-00031.safetensors",
|
| 189 |
+
"model.layers.22.mlp.down_proj.weight": "model-00009-of-00031.safetensors",
|
| 190 |
+
"model.layers.22.mlp.gate_proj.weight": "model-00009-of-00031.safetensors",
|
| 191 |
+
"model.layers.22.mlp.up_proj.weight": "model-00009-of-00031.safetensors",
|
| 192 |
+
"model.layers.22.post_attention_layernorm.weight": "model-00009-of-00031.safetensors",
|
| 193 |
+
"model.layers.22.self_attn.k_proj.bias": "model-00009-of-00031.safetensors",
|
| 194 |
+
"model.layers.22.self_attn.k_proj.weight": "model-00009-of-00031.safetensors",
|
| 195 |
+
"model.layers.22.self_attn.o_proj.weight": "model-00009-of-00031.safetensors",
|
| 196 |
+
"model.layers.22.self_attn.q_proj.bias": "model-00009-of-00031.safetensors",
|
| 197 |
+
"model.layers.22.self_attn.q_proj.weight": "model-00009-of-00031.safetensors",
|
| 198 |
+
"model.layers.22.self_attn.v_proj.bias": "model-00009-of-00031.safetensors",
|
| 199 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00009-of-00031.safetensors",
|
| 200 |
+
"model.layers.23.input_layernorm.weight": "model-00010-of-00031.safetensors",
|
| 201 |
+
"model.layers.23.mlp.down_proj.weight": "model-00010-of-00031.safetensors",
|
| 202 |
+
"model.layers.23.mlp.gate_proj.weight": "model-00010-of-00031.safetensors",
|
| 203 |
+
"model.layers.23.mlp.up_proj.weight": "model-00010-of-00031.safetensors",
|
| 204 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00010-of-00031.safetensors",
|
| 205 |
+
"model.layers.23.self_attn.k_proj.bias": "model-00009-of-00031.safetensors",
|
| 206 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00009-of-00031.safetensors",
|
| 207 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00009-of-00031.safetensors",
|
| 208 |
+
"model.layers.23.self_attn.q_proj.bias": "model-00009-of-00031.safetensors",
|
| 209 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00009-of-00031.safetensors",
|
| 210 |
+
"model.layers.23.self_attn.v_proj.bias": "model-00009-of-00031.safetensors",
|
| 211 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00009-of-00031.safetensors",
|
| 212 |
+
"model.layers.24.input_layernorm.weight": "model-00010-of-00031.safetensors",
|
| 213 |
+
"model.layers.24.mlp.down_proj.weight": "model-00010-of-00031.safetensors",
|
| 214 |
+
"model.layers.24.mlp.gate_proj.weight": "model-00010-of-00031.safetensors",
|
| 215 |
+
"model.layers.24.mlp.up_proj.weight": "model-00010-of-00031.safetensors",
|
| 216 |
+
"model.layers.24.post_attention_layernorm.weight": "model-00010-of-00031.safetensors",
|
| 217 |
+
"model.layers.24.self_attn.k_proj.bias": "model-00010-of-00031.safetensors",
|
| 218 |
+
"model.layers.24.self_attn.k_proj.weight": "model-00010-of-00031.safetensors",
|
| 219 |
+
"model.layers.24.self_attn.o_proj.weight": "model-00010-of-00031.safetensors",
|
| 220 |
+
"model.layers.24.self_attn.q_proj.bias": "model-00010-of-00031.safetensors",
|
| 221 |
+
"model.layers.24.self_attn.q_proj.weight": "model-00010-of-00031.safetensors",
|
| 222 |
+
"model.layers.24.self_attn.v_proj.bias": "model-00010-of-00031.safetensors",
|
| 223 |
+
"model.layers.24.self_attn.v_proj.weight": "model-00010-of-00031.safetensors",
|
| 224 |
+
"model.layers.25.input_layernorm.weight": "model-00010-of-00031.safetensors",
|
| 225 |
+
"model.layers.25.mlp.down_proj.weight": "model-00010-of-00031.safetensors",
|
| 226 |
+
"model.layers.25.mlp.gate_proj.weight": "model-00010-of-00031.safetensors",
|
| 227 |
+
"model.layers.25.mlp.up_proj.weight": "model-00010-of-00031.safetensors",
|
| 228 |
+
"model.layers.25.post_attention_layernorm.weight": "model-00010-of-00031.safetensors",
|
| 229 |
+
"model.layers.25.self_attn.k_proj.bias": "model-00010-of-00031.safetensors",
|
| 230 |
+
"model.layers.25.self_attn.k_proj.weight": "model-00010-of-00031.safetensors",
|
| 231 |
+
"model.layers.25.self_attn.o_proj.weight": "model-00010-of-00031.safetensors",
|
| 232 |
+
"model.layers.25.self_attn.q_proj.bias": "model-00010-of-00031.safetensors",
|
| 233 |
+
"model.layers.25.self_attn.q_proj.weight": "model-00010-of-00031.safetensors",
|
| 234 |
+
"model.layers.25.self_attn.v_proj.bias": "model-00010-of-00031.safetensors",
|
| 235 |
+
"model.layers.25.self_attn.v_proj.weight": "model-00010-of-00031.safetensors",
|
| 236 |
+
"model.layers.26.input_layernorm.weight": "model-00011-of-00031.safetensors",
|
| 237 |
+
"model.layers.26.mlp.down_proj.weight": "model-00011-of-00031.safetensors",
|
| 238 |
+
"model.layers.26.mlp.gate_proj.weight": "model-00011-of-00031.safetensors",
|
| 239 |
+
"model.layers.26.mlp.up_proj.weight": "model-00011-of-00031.safetensors",
|
| 240 |
+
"model.layers.26.post_attention_layernorm.weight": "model-00011-of-00031.safetensors",
|
| 241 |
+
"model.layers.26.self_attn.k_proj.bias": "model-00011-of-00031.safetensors",
|
| 242 |
+
"model.layers.26.self_attn.k_proj.weight": "model-00011-of-00031.safetensors",
|
| 243 |
+
"model.layers.26.self_attn.o_proj.weight": "model-00011-of-00031.safetensors",
|
| 244 |
+
"model.layers.26.self_attn.q_proj.bias": "model-00011-of-00031.safetensors",
|
| 245 |
+
"model.layers.26.self_attn.q_proj.weight": "model-00011-of-00031.safetensors",
|
| 246 |
+
"model.layers.26.self_attn.v_proj.bias": "model-00011-of-00031.safetensors",
|
| 247 |
+
"model.layers.26.self_attn.v_proj.weight": "model-00011-of-00031.safetensors",
|
| 248 |
+
"model.layers.27.input_layernorm.weight": "model-00011-of-00031.safetensors",
|
| 249 |
+
"model.layers.27.mlp.down_proj.weight": "model-00011-of-00031.safetensors",
|
| 250 |
+
"model.layers.27.mlp.gate_proj.weight": "model-00011-of-00031.safetensors",
|
| 251 |
+
"model.layers.27.mlp.up_proj.weight": "model-00011-of-00031.safetensors",
|
| 252 |
+
"model.layers.27.post_attention_layernorm.weight": "model-00011-of-00031.safetensors",
|
| 253 |
+
"model.layers.27.self_attn.k_proj.bias": "model-00011-of-00031.safetensors",
|
| 254 |
+
"model.layers.27.self_attn.k_proj.weight": "model-00011-of-00031.safetensors",
|
| 255 |
+
"model.layers.27.self_attn.o_proj.weight": "model-00011-of-00031.safetensors",
|
| 256 |
+
"model.layers.27.self_attn.q_proj.bias": "model-00011-of-00031.safetensors",
|
| 257 |
+
"model.layers.27.self_attn.q_proj.weight": "model-00011-of-00031.safetensors",
|
| 258 |
+
"model.layers.27.self_attn.v_proj.bias": "model-00011-of-00031.safetensors",
|
| 259 |
+
"model.layers.27.self_attn.v_proj.weight": "model-00011-of-00031.safetensors",
|
| 260 |
+
"model.layers.28.input_layernorm.weight": "model-00012-of-00031.safetensors",
|
| 261 |
+
"model.layers.28.mlp.down_proj.weight": "model-00012-of-00031.safetensors",
|
| 262 |
+
"model.layers.28.mlp.gate_proj.weight": "model-00011-of-00031.safetensors",
|
| 263 |
+
"model.layers.28.mlp.up_proj.weight": "model-00011-of-00031.safetensors",
|
| 264 |
+
"model.layers.28.post_attention_layernorm.weight": "model-00012-of-00031.safetensors",
|
| 265 |
+
"model.layers.28.self_attn.k_proj.bias": "model-00011-of-00031.safetensors",
|
| 266 |
+
"model.layers.28.self_attn.k_proj.weight": "model-00011-of-00031.safetensors",
|
| 267 |
+
"model.layers.28.self_attn.o_proj.weight": "model-00011-of-00031.safetensors",
|
| 268 |
+
"model.layers.28.self_attn.q_proj.bias": "model-00011-of-00031.safetensors",
|
| 269 |
+
"model.layers.28.self_attn.q_proj.weight": "model-00011-of-00031.safetensors",
|
| 270 |
+
"model.layers.28.self_attn.v_proj.bias": "model-00011-of-00031.safetensors",
|
| 271 |
+
"model.layers.28.self_attn.v_proj.weight": "model-00011-of-00031.safetensors",
|
| 272 |
+
"model.layers.29.input_layernorm.weight": "model-00012-of-00031.safetensors",
|
| 273 |
+
"model.layers.29.mlp.down_proj.weight": "model-00012-of-00031.safetensors",
|
| 274 |
+
"model.layers.29.mlp.gate_proj.weight": "model-00012-of-00031.safetensors",
|
| 275 |
+
"model.layers.29.mlp.up_proj.weight": "model-00012-of-00031.safetensors",
|
| 276 |
+
"model.layers.29.post_attention_layernorm.weight": "model-00012-of-00031.safetensors",
|
| 277 |
+
"model.layers.29.self_attn.k_proj.bias": "model-00012-of-00031.safetensors",
|
| 278 |
+
"model.layers.29.self_attn.k_proj.weight": "model-00012-of-00031.safetensors",
|
| 279 |
+
"model.layers.29.self_attn.o_proj.weight": "model-00012-of-00031.safetensors",
|
| 280 |
+
"model.layers.29.self_attn.q_proj.bias": "model-00012-of-00031.safetensors",
|
| 281 |
+
"model.layers.29.self_attn.q_proj.weight": "model-00012-of-00031.safetensors",
|
| 282 |
+
"model.layers.29.self_attn.v_proj.bias": "model-00012-of-00031.safetensors",
|
| 283 |
+
"model.layers.29.self_attn.v_proj.weight": "model-00012-of-00031.safetensors",
|
| 284 |
+
"model.layers.3.input_layernorm.weight": "model-00002-of-00031.safetensors",
|
| 285 |
+
"model.layers.3.mlp.down_proj.weight": "model-00002-of-00031.safetensors",
|
| 286 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00002-of-00031.safetensors",
|
| 287 |
+
"model.layers.3.mlp.up_proj.weight": "model-00002-of-00031.safetensors",
|
| 288 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00002-of-00031.safetensors",
|
| 289 |
+
"model.layers.3.self_attn.k_proj.bias": "model-00002-of-00031.safetensors",
|
| 290 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00002-of-00031.safetensors",
|
| 291 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00002-of-00031.safetensors",
|
| 292 |
+
"model.layers.3.self_attn.q_proj.bias": "model-00002-of-00031.safetensors",
|
| 293 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00002-of-00031.safetensors",
|
| 294 |
+
"model.layers.3.self_attn.v_proj.bias": "model-00002-of-00031.safetensors",
|
| 295 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00002-of-00031.safetensors",
|
| 296 |
+
"model.layers.30.input_layernorm.weight": "model-00012-of-00031.safetensors",
|
| 297 |
+
"model.layers.30.mlp.down_proj.weight": "model-00012-of-00031.safetensors",
|
| 298 |
+
"model.layers.30.mlp.gate_proj.weight": "model-00012-of-00031.safetensors",
|
| 299 |
+
"model.layers.30.mlp.up_proj.weight": "model-00012-of-00031.safetensors",
|
| 300 |
+
"model.layers.30.post_attention_layernorm.weight": "model-00012-of-00031.safetensors",
|
| 301 |
+
"model.layers.30.self_attn.k_proj.bias": "model-00012-of-00031.safetensors",
|
| 302 |
+
"model.layers.30.self_attn.k_proj.weight": "model-00012-of-00031.safetensors",
|
| 303 |
+
"model.layers.30.self_attn.o_proj.weight": "model-00012-of-00031.safetensors",
|
| 304 |
+
"model.layers.30.self_attn.q_proj.bias": "model-00012-of-00031.safetensors",
|
| 305 |
+
"model.layers.30.self_attn.q_proj.weight": "model-00012-of-00031.safetensors",
|
| 306 |
+
"model.layers.30.self_attn.v_proj.bias": "model-00012-of-00031.safetensors",
|
| 307 |
+
"model.layers.30.self_attn.v_proj.weight": "model-00012-of-00031.safetensors",
|
| 308 |
+
"model.layers.31.input_layernorm.weight": "model-00013-of-00031.safetensors",
|
| 309 |
+
"model.layers.31.mlp.down_proj.weight": "model-00013-of-00031.safetensors",
|
| 310 |
+
"model.layers.31.mlp.gate_proj.weight": "model-00012-of-00031.safetensors",
|
| 311 |
+
"model.layers.31.mlp.up_proj.weight": "model-00013-of-00031.safetensors",
|
| 312 |
+
"model.layers.31.post_attention_layernorm.weight": "model-00013-of-00031.safetensors",
|
| 313 |
+
"model.layers.31.self_attn.k_proj.bias": "model-00012-of-00031.safetensors",
|
| 314 |
+
"model.layers.31.self_attn.k_proj.weight": "model-00012-of-00031.safetensors",
|
| 315 |
+
"model.layers.31.self_attn.o_proj.weight": "model-00012-of-00031.safetensors",
|
| 316 |
+
"model.layers.31.self_attn.q_proj.bias": "model-00012-of-00031.safetensors",
|
| 317 |
+
"model.layers.31.self_attn.q_proj.weight": "model-00012-of-00031.safetensors",
|
| 318 |
+
"model.layers.31.self_attn.v_proj.bias": "model-00012-of-00031.safetensors",
|
| 319 |
+
"model.layers.31.self_attn.v_proj.weight": "model-00012-of-00031.safetensors",
|
| 320 |
+
"model.layers.32.input_layernorm.weight": "model-00013-of-00031.safetensors",
|
| 321 |
+
"model.layers.32.mlp.down_proj.weight": "model-00013-of-00031.safetensors",
|
| 322 |
+
"model.layers.32.mlp.gate_proj.weight": "model-00013-of-00031.safetensors",
|
| 323 |
+
"model.layers.32.mlp.up_proj.weight": "model-00013-of-00031.safetensors",
|
| 324 |
+
"model.layers.32.post_attention_layernorm.weight": "model-00013-of-00031.safetensors",
|
| 325 |
+
"model.layers.32.self_attn.k_proj.bias": "model-00013-of-00031.safetensors",
|
| 326 |
+
"model.layers.32.self_attn.k_proj.weight": "model-00013-of-00031.safetensors",
|
| 327 |
+
"model.layers.32.self_attn.o_proj.weight": "model-00013-of-00031.safetensors",
|
| 328 |
+
"model.layers.32.self_attn.q_proj.bias": "model-00013-of-00031.safetensors",
|
| 329 |
+
"model.layers.32.self_attn.q_proj.weight": "model-00013-of-00031.safetensors",
|
| 330 |
+
"model.layers.32.self_attn.v_proj.bias": "model-00013-of-00031.safetensors",
|
| 331 |
+
"model.layers.32.self_attn.v_proj.weight": "model-00013-of-00031.safetensors",
|
| 332 |
+
"model.layers.33.input_layernorm.weight": "model-00013-of-00031.safetensors",
|
| 333 |
+
"model.layers.33.mlp.down_proj.weight": "model-00013-of-00031.safetensors",
|
| 334 |
+
"model.layers.33.mlp.gate_proj.weight": "model-00013-of-00031.safetensors",
|
| 335 |
+
"model.layers.33.mlp.up_proj.weight": "model-00013-of-00031.safetensors",
|
| 336 |
+
"model.layers.33.post_attention_layernorm.weight": "model-00013-of-00031.safetensors",
|
| 337 |
+
"model.layers.33.self_attn.k_proj.bias": "model-00013-of-00031.safetensors",
|
| 338 |
+
"model.layers.33.self_attn.k_proj.weight": "model-00013-of-00031.safetensors",
|
| 339 |
+
"model.layers.33.self_attn.o_proj.weight": "model-00013-of-00031.safetensors",
|
| 340 |
+
"model.layers.33.self_attn.q_proj.bias": "model-00013-of-00031.safetensors",
|
| 341 |
+
"model.layers.33.self_attn.q_proj.weight": "model-00013-of-00031.safetensors",
|
| 342 |
+
"model.layers.33.self_attn.v_proj.bias": "model-00013-of-00031.safetensors",
|
| 343 |
+
"model.layers.33.self_attn.v_proj.weight": "model-00013-of-00031.safetensors",
|
| 344 |
+
"model.layers.34.input_layernorm.weight": "model-00014-of-00031.safetensors",
|
| 345 |
+
"model.layers.34.mlp.down_proj.weight": "model-00014-of-00031.safetensors",
|
| 346 |
+
"model.layers.34.mlp.gate_proj.weight": "model-00014-of-00031.safetensors",
|
| 347 |
+
"model.layers.34.mlp.up_proj.weight": "model-00014-of-00031.safetensors",
|
| 348 |
+
"model.layers.34.post_attention_layernorm.weight": "model-00014-of-00031.safetensors",
|
| 349 |
+
"model.layers.34.self_attn.k_proj.bias": "model-00013-of-00031.safetensors",
|
| 350 |
+
"model.layers.34.self_attn.k_proj.weight": "model-00013-of-00031.safetensors",
|
| 351 |
+
"model.layers.34.self_attn.o_proj.weight": "model-00013-of-00031.safetensors",
|
| 352 |
+
"model.layers.34.self_attn.q_proj.bias": "model-00013-of-00031.safetensors",
|
| 353 |
+
"model.layers.34.self_attn.q_proj.weight": "model-00013-of-00031.safetensors",
|
| 354 |
+
"model.layers.34.self_attn.v_proj.bias": "model-00013-of-00031.safetensors",
|
| 355 |
+
"model.layers.34.self_attn.v_proj.weight": "model-00013-of-00031.safetensors",
|
| 356 |
+
"model.layers.35.input_layernorm.weight": "model-00014-of-00031.safetensors",
|
| 357 |
+
"model.layers.35.mlp.down_proj.weight": "model-00014-of-00031.safetensors",
|
| 358 |
+
"model.layers.35.mlp.gate_proj.weight": "model-00014-of-00031.safetensors",
|
| 359 |
+
"model.layers.35.mlp.up_proj.weight": "model-00014-of-00031.safetensors",
|
| 360 |
+
"model.layers.35.post_attention_layernorm.weight": "model-00014-of-00031.safetensors",
|
| 361 |
+
"model.layers.35.self_attn.k_proj.bias": "model-00014-of-00031.safetensors",
|
| 362 |
+
"model.layers.35.self_attn.k_proj.weight": "model-00014-of-00031.safetensors",
|
| 363 |
+
"model.layers.35.self_attn.o_proj.weight": "model-00014-of-00031.safetensors",
|
| 364 |
+
"model.layers.35.self_attn.q_proj.bias": "model-00014-of-00031.safetensors",
|
| 365 |
+
"model.layers.35.self_attn.q_proj.weight": "model-00014-of-00031.safetensors",
|
| 366 |
+
"model.layers.35.self_attn.v_proj.bias": "model-00014-of-00031.safetensors",
|
| 367 |
+
"model.layers.35.self_attn.v_proj.weight": "model-00014-of-00031.safetensors",
|
| 368 |
+
"model.layers.36.input_layernorm.weight": "model-00014-of-00031.safetensors",
|
| 369 |
+
"model.layers.36.mlp.down_proj.weight": "model-00014-of-00031.safetensors",
|
| 370 |
+
"model.layers.36.mlp.gate_proj.weight": "model-00014-of-00031.safetensors",
|
| 371 |
+
"model.layers.36.mlp.up_proj.weight": "model-00014-of-00031.safetensors",
|
| 372 |
+
"model.layers.36.post_attention_layernorm.weight": "model-00014-of-00031.safetensors",
|
| 373 |
+
"model.layers.36.self_attn.k_proj.bias": "model-00014-of-00031.safetensors",
|
| 374 |
+
"model.layers.36.self_attn.k_proj.weight": "model-00014-of-00031.safetensors",
|
| 375 |
+
"model.layers.36.self_attn.o_proj.weight": "model-00014-of-00031.safetensors",
|
| 376 |
+
"model.layers.36.self_attn.q_proj.bias": "model-00014-of-00031.safetensors",
|
| 377 |
+
"model.layers.36.self_attn.q_proj.weight": "model-00014-of-00031.safetensors",
|
| 378 |
+
"model.layers.36.self_attn.v_proj.bias": "model-00014-of-00031.safetensors",
|
| 379 |
+
"model.layers.36.self_attn.v_proj.weight": "model-00014-of-00031.safetensors",
|
| 380 |
+
"model.layers.37.input_layernorm.weight": "model-00015-of-00031.safetensors",
|
| 381 |
+
"model.layers.37.mlp.down_proj.weight": "model-00015-of-00031.safetensors",
|
| 382 |
+
"model.layers.37.mlp.gate_proj.weight": "model-00015-of-00031.safetensors",
|
| 383 |
+
"model.layers.37.mlp.up_proj.weight": "model-00015-of-00031.safetensors",
|
| 384 |
+
"model.layers.37.post_attention_layernorm.weight": "model-00015-of-00031.safetensors",
|
| 385 |
+
"model.layers.37.self_attn.k_proj.bias": "model-00015-of-00031.safetensors",
|
| 386 |
+
"model.layers.37.self_attn.k_proj.weight": "model-00015-of-00031.safetensors",
|
| 387 |
+
"model.layers.37.self_attn.o_proj.weight": "model-00015-of-00031.safetensors",
|
| 388 |
+
"model.layers.37.self_attn.q_proj.bias": "model-00015-of-00031.safetensors",
|
| 389 |
+
"model.layers.37.self_attn.q_proj.weight": "model-00015-of-00031.safetensors",
|
| 390 |
+
"model.layers.37.self_attn.v_proj.bias": "model-00015-of-00031.safetensors",
|
| 391 |
+
"model.layers.37.self_attn.v_proj.weight": "model-00015-of-00031.safetensors",
|
| 392 |
+
"model.layers.38.input_layernorm.weight": "model-00015-of-00031.safetensors",
|
| 393 |
+
"model.layers.38.mlp.down_proj.weight": "model-00015-of-00031.safetensors",
|
| 394 |
+
"model.layers.38.mlp.gate_proj.weight": "model-00015-of-00031.safetensors",
|
| 395 |
+
"model.layers.38.mlp.up_proj.weight": "model-00015-of-00031.safetensors",
|
| 396 |
+
"model.layers.38.post_attention_layernorm.weight": "model-00015-of-00031.safetensors",
|
| 397 |
+
"model.layers.38.self_attn.k_proj.bias": "model-00015-of-00031.safetensors",
|
| 398 |
+
"model.layers.38.self_attn.k_proj.weight": "model-00015-of-00031.safetensors",
|
| 399 |
+
"model.layers.38.self_attn.o_proj.weight": "model-00015-of-00031.safetensors",
|
| 400 |
+
"model.layers.38.self_attn.q_proj.bias": "model-00015-of-00031.safetensors",
|
| 401 |
+
"model.layers.38.self_attn.q_proj.weight": "model-00015-of-00031.safetensors",
|
| 402 |
+
"model.layers.38.self_attn.v_proj.bias": "model-00015-of-00031.safetensors",
|
| 403 |
+
"model.layers.38.self_attn.v_proj.weight": "model-00015-of-00031.safetensors",
|
| 404 |
+
"model.layers.39.input_layernorm.weight": "model-00016-of-00031.safetensors",
|
| 405 |
+
"model.layers.39.mlp.down_proj.weight": "model-00016-of-00031.safetensors",
|
| 406 |
+
"model.layers.39.mlp.gate_proj.weight": "model-00015-of-00031.safetensors",
|
| 407 |
+
"model.layers.39.mlp.up_proj.weight": "model-00015-of-00031.safetensors",
|
| 408 |
+
"model.layers.39.post_attention_layernorm.weight": "model-00016-of-00031.safetensors",
|
| 409 |
+
"model.layers.39.self_attn.k_proj.bias": "model-00015-of-00031.safetensors",
|
| 410 |
+
"model.layers.39.self_attn.k_proj.weight": "model-00015-of-00031.safetensors",
|
| 411 |
+
"model.layers.39.self_attn.o_proj.weight": "model-00015-of-00031.safetensors",
|
| 412 |
+
"model.layers.39.self_attn.q_proj.bias": "model-00015-of-00031.safetensors",
|
| 413 |
+
"model.layers.39.self_attn.q_proj.weight": "model-00015-of-00031.safetensors",
|
| 414 |
+
"model.layers.39.self_attn.v_proj.bias": "model-00015-of-00031.safetensors",
|
| 415 |
+
"model.layers.39.self_attn.v_proj.weight": "model-00015-of-00031.safetensors",
|
| 416 |
+
"model.layers.4.input_layernorm.weight": "model-00003-of-00031.safetensors",
|
| 417 |
+
"model.layers.4.mlp.down_proj.weight": "model-00003-of-00031.safetensors",
|
| 418 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00003-of-00031.safetensors",
|
| 419 |
+
"model.layers.4.mlp.up_proj.weight": "model-00003-of-00031.safetensors",
|
| 420 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00003-of-00031.safetensors",
|
| 421 |
+
"model.layers.4.self_attn.k_proj.bias": "model-00003-of-00031.safetensors",
|
| 422 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00003-of-00031.safetensors",
|
| 423 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00003-of-00031.safetensors",
|
| 424 |
+
"model.layers.4.self_attn.q_proj.bias": "model-00003-of-00031.safetensors",
|
| 425 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00003-of-00031.safetensors",
|
| 426 |
+
"model.layers.4.self_attn.v_proj.bias": "model-00003-of-00031.safetensors",
|
| 427 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00003-of-00031.safetensors",
|
| 428 |
+
"model.layers.40.input_layernorm.weight": "model-00016-of-00031.safetensors",
|
| 429 |
+
"model.layers.40.mlp.down_proj.weight": "model-00016-of-00031.safetensors",
|
| 430 |
+
"model.layers.40.mlp.gate_proj.weight": "model-00016-of-00031.safetensors",
|
| 431 |
+
"model.layers.40.mlp.up_proj.weight": "model-00016-of-00031.safetensors",
|
| 432 |
+
"model.layers.40.post_attention_layernorm.weight": "model-00016-of-00031.safetensors",
|
| 433 |
+
"model.layers.40.self_attn.k_proj.bias": "model-00016-of-00031.safetensors",
|
| 434 |
+
"model.layers.40.self_attn.k_proj.weight": "model-00016-of-00031.safetensors",
|
| 435 |
+
"model.layers.40.self_attn.o_proj.weight": "model-00016-of-00031.safetensors",
|
| 436 |
+
"model.layers.40.self_attn.q_proj.bias": "model-00016-of-00031.safetensors",
|
| 437 |
+
"model.layers.40.self_attn.q_proj.weight": "model-00016-of-00031.safetensors",
|
| 438 |
+
"model.layers.40.self_attn.v_proj.bias": "model-00016-of-00031.safetensors",
|
| 439 |
+
"model.layers.40.self_attn.v_proj.weight": "model-00016-of-00031.safetensors",
|
| 440 |
+
"model.layers.41.input_layernorm.weight": "model-00016-of-00031.safetensors",
|
| 441 |
+
"model.layers.41.mlp.down_proj.weight": "model-00016-of-00031.safetensors",
|
| 442 |
+
"model.layers.41.mlp.gate_proj.weight": "model-00016-of-00031.safetensors",
|
| 443 |
+
"model.layers.41.mlp.up_proj.weight": "model-00016-of-00031.safetensors",
|
| 444 |
+
"model.layers.41.post_attention_layernorm.weight": "model-00016-of-00031.safetensors",
|
| 445 |
+
"model.layers.41.self_attn.k_proj.bias": "model-00016-of-00031.safetensors",
|
| 446 |
+
"model.layers.41.self_attn.k_proj.weight": "model-00016-of-00031.safetensors",
|
| 447 |
+
"model.layers.41.self_attn.o_proj.weight": "model-00016-of-00031.safetensors",
|
| 448 |
+
"model.layers.41.self_attn.q_proj.bias": "model-00016-of-00031.safetensors",
|
| 449 |
+
"model.layers.41.self_attn.q_proj.weight": "model-00016-of-00031.safetensors",
|
| 450 |
+
"model.layers.41.self_attn.v_proj.bias": "model-00016-of-00031.safetensors",
|
| 451 |
+
"model.layers.41.self_attn.v_proj.weight": "model-00016-of-00031.safetensors",
|
| 452 |
+
"model.layers.42.input_layernorm.weight": "model-00017-of-00031.safetensors",
|
| 453 |
+
"model.layers.42.mlp.down_proj.weight": "model-00017-of-00031.safetensors",
|
| 454 |
+
"model.layers.42.mlp.gate_proj.weight": "model-00016-of-00031.safetensors",
|
| 455 |
+
"model.layers.42.mlp.up_proj.weight": "model-00017-of-00031.safetensors",
|
| 456 |
+
"model.layers.42.post_attention_layernorm.weight": "model-00017-of-00031.safetensors",
|
| 457 |
+
"model.layers.42.self_attn.k_proj.bias": "model-00016-of-00031.safetensors",
|
| 458 |
+
"model.layers.42.self_attn.k_proj.weight": "model-00016-of-00031.safetensors",
|
| 459 |
+
"model.layers.42.self_attn.o_proj.weight": "model-00016-of-00031.safetensors",
|
| 460 |
+
"model.layers.42.self_attn.q_proj.bias": "model-00016-of-00031.safetensors",
|
| 461 |
+
"model.layers.42.self_attn.q_proj.weight": "model-00016-of-00031.safetensors",
|
| 462 |
+
"model.layers.42.self_attn.v_proj.bias": "model-00016-of-00031.safetensors",
|
| 463 |
+
"model.layers.42.self_attn.v_proj.weight": "model-00016-of-00031.safetensors",
|
| 464 |
+
"model.layers.43.input_layernorm.weight": "model-00017-of-00031.safetensors",
|
| 465 |
+
"model.layers.43.mlp.down_proj.weight": "model-00017-of-00031.safetensors",
|
| 466 |
+
"model.layers.43.mlp.gate_proj.weight": "model-00017-of-00031.safetensors",
|
| 467 |
+
"model.layers.43.mlp.up_proj.weight": "model-00017-of-00031.safetensors",
|
| 468 |
+
"model.layers.43.post_attention_layernorm.weight": "model-00017-of-00031.safetensors",
|
| 469 |
+
"model.layers.43.self_attn.k_proj.bias": "model-00017-of-00031.safetensors",
|
| 470 |
+
"model.layers.43.self_attn.k_proj.weight": "model-00017-of-00031.safetensors",
|
| 471 |
+
"model.layers.43.self_attn.o_proj.weight": "model-00017-of-00031.safetensors",
|
| 472 |
+
"model.layers.43.self_attn.q_proj.bias": "model-00017-of-00031.safetensors",
|
| 473 |
+
"model.layers.43.self_attn.q_proj.weight": "model-00017-of-00031.safetensors",
|
| 474 |
+
"model.layers.43.self_attn.v_proj.bias": "model-00017-of-00031.safetensors",
|
| 475 |
+
"model.layers.43.self_attn.v_proj.weight": "model-00017-of-00031.safetensors",
|
| 476 |
+
"model.layers.44.input_layernorm.weight": "model-00017-of-00031.safetensors",
|
| 477 |
+
"model.layers.44.mlp.down_proj.weight": "model-00017-of-00031.safetensors",
|
| 478 |
+
"model.layers.44.mlp.gate_proj.weight": "model-00017-of-00031.safetensors",
|
| 479 |
+
"model.layers.44.mlp.up_proj.weight": "model-00017-of-00031.safetensors",
|
| 480 |
+
"model.layers.44.post_attention_layernorm.weight": "model-00017-of-00031.safetensors",
|
| 481 |
+
"model.layers.44.self_attn.k_proj.bias": "model-00017-of-00031.safetensors",
|
| 482 |
+
"model.layers.44.self_attn.k_proj.weight": "model-00017-of-00031.safetensors",
|
| 483 |
+
"model.layers.44.self_attn.o_proj.weight": "model-00017-of-00031.safetensors",
|
| 484 |
+
"model.layers.44.self_attn.q_proj.bias": "model-00017-of-00031.safetensors",
|
| 485 |
+
"model.layers.44.self_attn.q_proj.weight": "model-00017-of-00031.safetensors",
|
| 486 |
+
"model.layers.44.self_attn.v_proj.bias": "model-00017-of-00031.safetensors",
|
| 487 |
+
"model.layers.44.self_attn.v_proj.weight": "model-00017-of-00031.safetensors",
|
| 488 |
+
"model.layers.45.input_layernorm.weight": "model-00018-of-00031.safetensors",
|
| 489 |
+
"model.layers.45.mlp.down_proj.weight": "model-00018-of-00031.safetensors",
|
| 490 |
+
"model.layers.45.mlp.gate_proj.weight": "model-00018-of-00031.safetensors",
|
| 491 |
+
"model.layers.45.mlp.up_proj.weight": "model-00018-of-00031.safetensors",
|
| 492 |
+
"model.layers.45.post_attention_layernorm.weight": "model-00018-of-00031.safetensors",
|
| 493 |
+
"model.layers.45.self_attn.k_proj.bias": "model-00017-of-00031.safetensors",
|
| 494 |
+
"model.layers.45.self_attn.k_proj.weight": "model-00017-of-00031.safetensors",
|
| 495 |
+
"model.layers.45.self_attn.o_proj.weight": "model-00017-of-00031.safetensors",
|
| 496 |
+
"model.layers.45.self_attn.q_proj.bias": "model-00017-of-00031.safetensors",
|
| 497 |
+
"model.layers.45.self_attn.q_proj.weight": "model-00017-of-00031.safetensors",
|
| 498 |
+
"model.layers.45.self_attn.v_proj.bias": "model-00017-of-00031.safetensors",
|
| 499 |
+
"model.layers.45.self_attn.v_proj.weight": "model-00017-of-00031.safetensors",
|
| 500 |
+
"model.layers.46.input_layernorm.weight": "model-00018-of-00031.safetensors",
|
| 501 |
+
"model.layers.46.mlp.down_proj.weight": "model-00018-of-00031.safetensors",
|
| 502 |
+
"model.layers.46.mlp.gate_proj.weight": "model-00018-of-00031.safetensors",
|
| 503 |
+
"model.layers.46.mlp.up_proj.weight": "model-00018-of-00031.safetensors",
|
| 504 |
+
"model.layers.46.post_attention_layernorm.weight": "model-00018-of-00031.safetensors",
|
| 505 |
+
"model.layers.46.self_attn.k_proj.bias": "model-00018-of-00031.safetensors",
|
| 506 |
+
"model.layers.46.self_attn.k_proj.weight": "model-00018-of-00031.safetensors",
|
| 507 |
+
"model.layers.46.self_attn.o_proj.weight": "model-00018-of-00031.safetensors",
|
| 508 |
+
"model.layers.46.self_attn.q_proj.bias": "model-00018-of-00031.safetensors",
|
| 509 |
+
"model.layers.46.self_attn.q_proj.weight": "model-00018-of-00031.safetensors",
|
| 510 |
+
"model.layers.46.self_attn.v_proj.bias": "model-00018-of-00031.safetensors",
|
| 511 |
+
"model.layers.46.self_attn.v_proj.weight": "model-00018-of-00031.safetensors",
|
| 512 |
+
"model.layers.47.input_layernorm.weight": "model-00018-of-00031.safetensors",
|
| 513 |
+
"model.layers.47.mlp.down_proj.weight": "model-00018-of-00031.safetensors",
|
| 514 |
+
"model.layers.47.mlp.gate_proj.weight": "model-00018-of-00031.safetensors",
|
| 515 |
+
"model.layers.47.mlp.up_proj.weight": "model-00018-of-00031.safetensors",
|
| 516 |
+
"model.layers.47.post_attention_layernorm.weight": "model-00018-of-00031.safetensors",
|
| 517 |
+
"model.layers.47.self_attn.k_proj.bias": "model-00018-of-00031.safetensors",
|
| 518 |
+
"model.layers.47.self_attn.k_proj.weight": "model-00018-of-00031.safetensors",
|
| 519 |
+
"model.layers.47.self_attn.o_proj.weight": "model-00018-of-00031.safetensors",
|
| 520 |
+
"model.layers.47.self_attn.q_proj.bias": "model-00018-of-00031.safetensors",
|
| 521 |
+
"model.layers.47.self_attn.q_proj.weight": "model-00018-of-00031.safetensors",
|
| 522 |
+
"model.layers.47.self_attn.v_proj.bias": "model-00018-of-00031.safetensors",
|
| 523 |
+
"model.layers.47.self_attn.v_proj.weight": "model-00018-of-00031.safetensors",
|
| 524 |
+
"model.layers.48.input_layernorm.weight": "model-00019-of-00031.safetensors",
|
| 525 |
+
"model.layers.48.mlp.down_proj.weight": "model-00019-of-00031.safetensors",
|
| 526 |
+
"model.layers.48.mlp.gate_proj.weight": "model-00019-of-00031.safetensors",
|
| 527 |
+
"model.layers.48.mlp.up_proj.weight": "model-00019-of-00031.safetensors",
|
| 528 |
+
"model.layers.48.post_attention_layernorm.weight": "model-00019-of-00031.safetensors",
|
| 529 |
+
"model.layers.48.self_attn.k_proj.bias": "model-00019-of-00031.safetensors",
|
| 530 |
+
"model.layers.48.self_attn.k_proj.weight": "model-00019-of-00031.safetensors",
|
| 531 |
+
"model.layers.48.self_attn.o_proj.weight": "model-00019-of-00031.safetensors",
|
| 532 |
+
"model.layers.48.self_attn.q_proj.bias": "model-00019-of-00031.safetensors",
|
| 533 |
+
"model.layers.48.self_attn.q_proj.weight": "model-00019-of-00031.safetensors",
|
| 534 |
+
"model.layers.48.self_attn.v_proj.bias": "model-00019-of-00031.safetensors",
|
| 535 |
+
"model.layers.48.self_attn.v_proj.weight": "model-00019-of-00031.safetensors",
|
| 536 |
+
"model.layers.49.input_layernorm.weight": "model-00019-of-00031.safetensors",
|
| 537 |
+
"model.layers.49.mlp.down_proj.weight": "model-00019-of-00031.safetensors",
|
| 538 |
+
"model.layers.49.mlp.gate_proj.weight": "model-00019-of-00031.safetensors",
|
| 539 |
+
"model.layers.49.mlp.up_proj.weight": "model-00019-of-00031.safetensors",
|
| 540 |
+
"model.layers.49.post_attention_layernorm.weight": "model-00019-of-00031.safetensors",
|
| 541 |
+
"model.layers.49.self_attn.k_proj.bias": "model-00019-of-00031.safetensors",
|
| 542 |
+
"model.layers.49.self_attn.k_proj.weight": "model-00019-of-00031.safetensors",
|
| 543 |
+
"model.layers.49.self_attn.o_proj.weight": "model-00019-of-00031.safetensors",
|
| 544 |
+
"model.layers.49.self_attn.q_proj.bias": "model-00019-of-00031.safetensors",
|
| 545 |
+
"model.layers.49.self_attn.q_proj.weight": "model-00019-of-00031.safetensors",
|
| 546 |
+
"model.layers.49.self_attn.v_proj.bias": "model-00019-of-00031.safetensors",
|
| 547 |
+
"model.layers.49.self_attn.v_proj.weight": "model-00019-of-00031.safetensors",
|
| 548 |
+
"model.layers.5.input_layernorm.weight": "model-00003-of-00031.safetensors",
|
| 549 |
+
"model.layers.5.mlp.down_proj.weight": "model-00003-of-00031.safetensors",
|
| 550 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00003-of-00031.safetensors",
|
| 551 |
+
"model.layers.5.mlp.up_proj.weight": "model-00003-of-00031.safetensors",
|
| 552 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00003-of-00031.safetensors",
|
| 553 |
+
"model.layers.5.self_attn.k_proj.bias": "model-00003-of-00031.safetensors",
|
| 554 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00003-of-00031.safetensors",
|
| 555 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00003-of-00031.safetensors",
|
| 556 |
+
"model.layers.5.self_attn.q_proj.bias": "model-00003-of-00031.safetensors",
|
| 557 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00003-of-00031.safetensors",
|
| 558 |
+
"model.layers.5.self_attn.v_proj.bias": "model-00003-of-00031.safetensors",
|
| 559 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00003-of-00031.safetensors",
|
| 560 |
+
"model.layers.50.input_layernorm.weight": "model-00020-of-00031.safetensors",
|
| 561 |
+
"model.layers.50.mlp.down_proj.weight": "model-00020-of-00031.safetensors",
|
| 562 |
+
"model.layers.50.mlp.gate_proj.weight": "model-00019-of-00031.safetensors",
|
| 563 |
+
"model.layers.50.mlp.up_proj.weight": "model-00019-of-00031.safetensors",
|
| 564 |
+
"model.layers.50.post_attention_layernorm.weight": "model-00020-of-00031.safetensors",
|
| 565 |
+
"model.layers.50.self_attn.k_proj.bias": "model-00019-of-00031.safetensors",
|
| 566 |
+
"model.layers.50.self_attn.k_proj.weight": "model-00019-of-00031.safetensors",
|
| 567 |
+
"model.layers.50.self_attn.o_proj.weight": "model-00019-of-00031.safetensors",
|
| 568 |
+
"model.layers.50.self_attn.q_proj.bias": "model-00019-of-00031.safetensors",
|
| 569 |
+
"model.layers.50.self_attn.q_proj.weight": "model-00019-of-00031.safetensors",
|
| 570 |
+
"model.layers.50.self_attn.v_proj.bias": "model-00019-of-00031.safetensors",
|
| 571 |
+
"model.layers.50.self_attn.v_proj.weight": "model-00019-of-00031.safetensors",
|
| 572 |
+
"model.layers.51.input_layernorm.weight": "model-00020-of-00031.safetensors",
|
| 573 |
+
"model.layers.51.mlp.down_proj.weight": "model-00020-of-00031.safetensors",
|
| 574 |
+
"model.layers.51.mlp.gate_proj.weight": "model-00020-of-00031.safetensors",
|
| 575 |
+
"model.layers.51.mlp.up_proj.weight": "model-00020-of-00031.safetensors",
|
| 576 |
+
"model.layers.51.post_attention_layernorm.weight": "model-00020-of-00031.safetensors",
|
| 577 |
+
"model.layers.51.self_attn.k_proj.bias": "model-00020-of-00031.safetensors",
|
| 578 |
+
"model.layers.51.self_attn.k_proj.weight": "model-00020-of-00031.safetensors",
|
| 579 |
+
"model.layers.51.self_attn.o_proj.weight": "model-00020-of-00031.safetensors",
|
| 580 |
+
"model.layers.51.self_attn.q_proj.bias": "model-00020-of-00031.safetensors",
|
| 581 |
+
"model.layers.51.self_attn.q_proj.weight": "model-00020-of-00031.safetensors",
|
| 582 |
+
"model.layers.51.self_attn.v_proj.bias": "model-00020-of-00031.safetensors",
|
| 583 |
+
"model.layers.51.self_attn.v_proj.weight": "model-00020-of-00031.safetensors",
|
| 584 |
+
"model.layers.52.input_layernorm.weight": "model-00020-of-00031.safetensors",
|
| 585 |
+
"model.layers.52.mlp.down_proj.weight": "model-00020-of-00031.safetensors",
|
| 586 |
+
"model.layers.52.mlp.gate_proj.weight": "model-00020-of-00031.safetensors",
|
| 587 |
+
"model.layers.52.mlp.up_proj.weight": "model-00020-of-00031.safetensors",
|
| 588 |
+
"model.layers.52.post_attention_layernorm.weight": "model-00020-of-00031.safetensors",
|
| 589 |
+
"model.layers.52.self_attn.k_proj.bias": "model-00020-of-00031.safetensors",
|
| 590 |
+
"model.layers.52.self_attn.k_proj.weight": "model-00020-of-00031.safetensors",
|
| 591 |
+
"model.layers.52.self_attn.o_proj.weight": "model-00020-of-00031.safetensors",
|
| 592 |
+
"model.layers.52.self_attn.q_proj.bias": "model-00020-of-00031.safetensors",
|
| 593 |
+
"model.layers.52.self_attn.q_proj.weight": "model-00020-of-00031.safetensors",
|
| 594 |
+
"model.layers.52.self_attn.v_proj.bias": "model-00020-of-00031.safetensors",
|
| 595 |
+
"model.layers.52.self_attn.v_proj.weight": "model-00020-of-00031.safetensors",
|
| 596 |
+
"model.layers.53.input_layernorm.weight": "model-00021-of-00031.safetensors",
|
| 597 |
+
"model.layers.53.mlp.down_proj.weight": "model-00021-of-00031.safetensors",
|
| 598 |
+
"model.layers.53.mlp.gate_proj.weight": "model-00020-of-00031.safetensors",
|
| 599 |
+
"model.layers.53.mlp.up_proj.weight": "model-00021-of-00031.safetensors",
|
| 600 |
+
"model.layers.53.post_attention_layernorm.weight": "model-00021-of-00031.safetensors",
|
| 601 |
+
"model.layers.53.self_attn.k_proj.bias": "model-00020-of-00031.safetensors",
|
| 602 |
+
"model.layers.53.self_attn.k_proj.weight": "model-00020-of-00031.safetensors",
|
| 603 |
+
"model.layers.53.self_attn.o_proj.weight": "model-00020-of-00031.safetensors",
|
| 604 |
+
"model.layers.53.self_attn.q_proj.bias": "model-00020-of-00031.safetensors",
|
| 605 |
+
"model.layers.53.self_attn.q_proj.weight": "model-00020-of-00031.safetensors",
|
| 606 |
+
"model.layers.53.self_attn.v_proj.bias": "model-00020-of-00031.safetensors",
|
| 607 |
+
"model.layers.53.self_attn.v_proj.weight": "model-00020-of-00031.safetensors",
|
| 608 |
+
"model.layers.54.input_layernorm.weight": "model-00021-of-00031.safetensors",
|
| 609 |
+
"model.layers.54.mlp.down_proj.weight": "model-00021-of-00031.safetensors",
|
| 610 |
+
"model.layers.54.mlp.gate_proj.weight": "model-00021-of-00031.safetensors",
|
| 611 |
+
"model.layers.54.mlp.up_proj.weight": "model-00021-of-00031.safetensors",
|
| 612 |
+
"model.layers.54.post_attention_layernorm.weight": "model-00021-of-00031.safetensors",
|
| 613 |
+
"model.layers.54.self_attn.k_proj.bias": "model-00021-of-00031.safetensors",
|
| 614 |
+
"model.layers.54.self_attn.k_proj.weight": "model-00021-of-00031.safetensors",
|
| 615 |
+
"model.layers.54.self_attn.o_proj.weight": "model-00021-of-00031.safetensors",
|
| 616 |
+
"model.layers.54.self_attn.q_proj.bias": "model-00021-of-00031.safetensors",
|
| 617 |
+
"model.layers.54.self_attn.q_proj.weight": "model-00021-of-00031.safetensors",
|
| 618 |
+
"model.layers.54.self_attn.v_proj.bias": "model-00021-of-00031.safetensors",
|
| 619 |
+
"model.layers.54.self_attn.v_proj.weight": "model-00021-of-00031.safetensors",
|
| 620 |
+
"model.layers.55.input_layernorm.weight": "model-00021-of-00031.safetensors",
|
| 621 |
+
"model.layers.55.mlp.down_proj.weight": "model-00021-of-00031.safetensors",
|
| 622 |
+
"model.layers.55.mlp.gate_proj.weight": "model-00021-of-00031.safetensors",
|
| 623 |
+
"model.layers.55.mlp.up_proj.weight": "model-00021-of-00031.safetensors",
|
| 624 |
+
"model.layers.55.post_attention_layernorm.weight": "model-00021-of-00031.safetensors",
|
| 625 |
+
"model.layers.55.self_attn.k_proj.bias": "model-00021-of-00031.safetensors",
|
| 626 |
+
"model.layers.55.self_attn.k_proj.weight": "model-00021-of-00031.safetensors",
|
| 627 |
+
"model.layers.55.self_attn.o_proj.weight": "model-00021-of-00031.safetensors",
|
| 628 |
+
"model.layers.55.self_attn.q_proj.bias": "model-00021-of-00031.safetensors",
|
| 629 |
+
"model.layers.55.self_attn.q_proj.weight": "model-00021-of-00031.safetensors",
|
| 630 |
+
"model.layers.55.self_attn.v_proj.bias": "model-00021-of-00031.safetensors",
|
| 631 |
+
"model.layers.55.self_attn.v_proj.weight": "model-00021-of-00031.safetensors",
|
| 632 |
+
"model.layers.56.input_layernorm.weight": "model-00022-of-00031.safetensors",
|
| 633 |
+
"model.layers.56.mlp.down_proj.weight": "model-00022-of-00031.safetensors",
|
| 634 |
+
"model.layers.56.mlp.gate_proj.weight": "model-00022-of-00031.safetensors",
|
| 635 |
+
"model.layers.56.mlp.up_proj.weight": "model-00022-of-00031.safetensors",
|
| 636 |
+
"model.layers.56.post_attention_layernorm.weight": "model-00022-of-00031.safetensors",
|
| 637 |
+
"model.layers.56.self_attn.k_proj.bias": "model-00021-of-00031.safetensors",
|
| 638 |
+
"model.layers.56.self_attn.k_proj.weight": "model-00021-of-00031.safetensors",
|
| 639 |
+
"model.layers.56.self_attn.o_proj.weight": "model-00021-of-00031.safetensors",
|
| 640 |
+
"model.layers.56.self_attn.q_proj.bias": "model-00021-of-00031.safetensors",
|
| 641 |
+
"model.layers.56.self_attn.q_proj.weight": "model-00021-of-00031.safetensors",
|
| 642 |
+
"model.layers.56.self_attn.v_proj.bias": "model-00021-of-00031.safetensors",
|
| 643 |
+
"model.layers.56.self_attn.v_proj.weight": "model-00021-of-00031.safetensors",
|
| 644 |
+
"model.layers.57.input_layernorm.weight": "model-00022-of-00031.safetensors",
|
| 645 |
+
"model.layers.57.mlp.down_proj.weight": "model-00022-of-00031.safetensors",
|
| 646 |
+
"model.layers.57.mlp.gate_proj.weight": "model-00022-of-00031.safetensors",
|
| 647 |
+
"model.layers.57.mlp.up_proj.weight": "model-00022-of-00031.safetensors",
|
| 648 |
+
"model.layers.57.post_attention_layernorm.weight": "model-00022-of-00031.safetensors",
|
| 649 |
+
"model.layers.57.self_attn.k_proj.bias": "model-00022-of-00031.safetensors",
|
| 650 |
+
"model.layers.57.self_attn.k_proj.weight": "model-00022-of-00031.safetensors",
|
| 651 |
+
"model.layers.57.self_attn.o_proj.weight": "model-00022-of-00031.safetensors",
|
| 652 |
+
"model.layers.57.self_attn.q_proj.bias": "model-00022-of-00031.safetensors",
|
| 653 |
+
"model.layers.57.self_attn.q_proj.weight": "model-00022-of-00031.safetensors",
|
| 654 |
+
"model.layers.57.self_attn.v_proj.bias": "model-00022-of-00031.safetensors",
|
| 655 |
+
"model.layers.57.self_attn.v_proj.weight": "model-00022-of-00031.safetensors",
|
| 656 |
+
"model.layers.58.input_layernorm.weight": "model-00022-of-00031.safetensors",
|
| 657 |
+
"model.layers.58.mlp.down_proj.weight": "model-00022-of-00031.safetensors",
|
| 658 |
+
"model.layers.58.mlp.gate_proj.weight": "model-00022-of-00031.safetensors",
|
| 659 |
+
"model.layers.58.mlp.up_proj.weight": "model-00022-of-00031.safetensors",
|
| 660 |
+
"model.layers.58.post_attention_layernorm.weight": "model-00022-of-00031.safetensors",
|
| 661 |
+
"model.layers.58.self_attn.k_proj.bias": "model-00022-of-00031.safetensors",
|
| 662 |
+
"model.layers.58.self_attn.k_proj.weight": "model-00022-of-00031.safetensors",
|
| 663 |
+
"model.layers.58.self_attn.o_proj.weight": "model-00022-of-00031.safetensors",
|
| 664 |
+
"model.layers.58.self_attn.q_proj.bias": "model-00022-of-00031.safetensors",
|
| 665 |
+
"model.layers.58.self_attn.q_proj.weight": "model-00022-of-00031.safetensors",
|
| 666 |
+
"model.layers.58.self_attn.v_proj.bias": "model-00022-of-00031.safetensors",
|
| 667 |
+
"model.layers.58.self_attn.v_proj.weight": "model-00022-of-00031.safetensors",
|
| 668 |
+
"model.layers.59.input_layernorm.weight": "model-00023-of-00031.safetensors",
|
| 669 |
+
"model.layers.59.mlp.down_proj.weight": "model-00023-of-00031.safetensors",
|
| 670 |
+
"model.layers.59.mlp.gate_proj.weight": "model-00023-of-00031.safetensors",
|
| 671 |
+
"model.layers.59.mlp.up_proj.weight": "model-00023-of-00031.safetensors",
|
| 672 |
+
"model.layers.59.post_attention_layernorm.weight": "model-00023-of-00031.safetensors",
|
| 673 |
+
"model.layers.59.self_attn.k_proj.bias": "model-00023-of-00031.safetensors",
|
| 674 |
+
"model.layers.59.self_attn.k_proj.weight": "model-00023-of-00031.safetensors",
|
| 675 |
+
"model.layers.59.self_attn.o_proj.weight": "model-00023-of-00031.safetensors",
|
| 676 |
+
"model.layers.59.self_attn.q_proj.bias": "model-00023-of-00031.safetensors",
|
| 677 |
+
"model.layers.59.self_attn.q_proj.weight": "model-00023-of-00031.safetensors",
|
| 678 |
+
"model.layers.59.self_attn.v_proj.bias": "model-00023-of-00031.safetensors",
|
| 679 |
+
"model.layers.59.self_attn.v_proj.weight": "model-00023-of-00031.safetensors",
|
| 680 |
+
"model.layers.6.input_layernorm.weight": "model-00004-of-00031.safetensors",
|
| 681 |
+
"model.layers.6.mlp.down_proj.weight": "model-00004-of-00031.safetensors",
|
| 682 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00003-of-00031.safetensors",
|
| 683 |
+
"model.layers.6.mlp.up_proj.weight": "model-00003-of-00031.safetensors",
|
| 684 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00004-of-00031.safetensors",
|
| 685 |
+
"model.layers.6.self_attn.k_proj.bias": "model-00003-of-00031.safetensors",
|
| 686 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00003-of-00031.safetensors",
|
| 687 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00003-of-00031.safetensors",
|
| 688 |
+
"model.layers.6.self_attn.q_proj.bias": "model-00003-of-00031.safetensors",
|
| 689 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00003-of-00031.safetensors",
|
| 690 |
+
"model.layers.6.self_attn.v_proj.bias": "model-00003-of-00031.safetensors",
|
| 691 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00003-of-00031.safetensors",
|
| 692 |
+
"model.layers.60.input_layernorm.weight": "model-00023-of-00031.safetensors",
|
| 693 |
+
"model.layers.60.mlp.down_proj.weight": "model-00023-of-00031.safetensors",
|
| 694 |
+
"model.layers.60.mlp.gate_proj.weight": "model-00023-of-00031.safetensors",
|
| 695 |
+
"model.layers.60.mlp.up_proj.weight": "model-00023-of-00031.safetensors",
|
| 696 |
+
"model.layers.60.post_attention_layernorm.weight": "model-00023-of-00031.safetensors",
|
| 697 |
+
"model.layers.60.self_attn.k_proj.bias": "model-00023-of-00031.safetensors",
|
| 698 |
+
"model.layers.60.self_attn.k_proj.weight": "model-00023-of-00031.safetensors",
|
| 699 |
+
"model.layers.60.self_attn.o_proj.weight": "model-00023-of-00031.safetensors",
|
| 700 |
+
"model.layers.60.self_attn.q_proj.bias": "model-00023-of-00031.safetensors",
|
| 701 |
+
"model.layers.60.self_attn.q_proj.weight": "model-00023-of-00031.safetensors",
|
| 702 |
+
"model.layers.60.self_attn.v_proj.bias": "model-00023-of-00031.safetensors",
|
| 703 |
+
"model.layers.60.self_attn.v_proj.weight": "model-00023-of-00031.safetensors",
|
| 704 |
+
"model.layers.61.input_layernorm.weight": "model-00024-of-00031.safetensors",
|
| 705 |
+
"model.layers.61.mlp.down_proj.weight": "model-00024-of-00031.safetensors",
|
| 706 |
+
"model.layers.61.mlp.gate_proj.weight": "model-00023-of-00031.safetensors",
|
| 707 |
+
"model.layers.61.mlp.up_proj.weight": "model-00023-of-00031.safetensors",
|
| 708 |
+
"model.layers.61.post_attention_layernorm.weight": "model-00024-of-00031.safetensors",
|
| 709 |
+
"model.layers.61.self_attn.k_proj.bias": "model-00023-of-00031.safetensors",
|
| 710 |
+
"model.layers.61.self_attn.k_proj.weight": "model-00023-of-00031.safetensors",
|
| 711 |
+
"model.layers.61.self_attn.o_proj.weight": "model-00023-of-00031.safetensors",
|
| 712 |
+
"model.layers.61.self_attn.q_proj.bias": "model-00023-of-00031.safetensors",
|
| 713 |
+
"model.layers.61.self_attn.q_proj.weight": "model-00023-of-00031.safetensors",
|
| 714 |
+
"model.layers.61.self_attn.v_proj.bias": "model-00023-of-00031.safetensors",
|
| 715 |
+
"model.layers.61.self_attn.v_proj.weight": "model-00023-of-00031.safetensors",
|
| 716 |
+
"model.layers.62.input_layernorm.weight": "model-00024-of-00031.safetensors",
|
| 717 |
+
"model.layers.62.mlp.down_proj.weight": "model-00024-of-00031.safetensors",
|
| 718 |
+
"model.layers.62.mlp.gate_proj.weight": "model-00024-of-00031.safetensors",
|
| 719 |
+
"model.layers.62.mlp.up_proj.weight": "model-00024-of-00031.safetensors",
|
| 720 |
+
"model.layers.62.post_attention_layernorm.weight": "model-00024-of-00031.safetensors",
|
| 721 |
+
"model.layers.62.self_attn.k_proj.bias": "model-00024-of-00031.safetensors",
|
| 722 |
+
"model.layers.62.self_attn.k_proj.weight": "model-00024-of-00031.safetensors",
|
| 723 |
+
"model.layers.62.self_attn.o_proj.weight": "model-00024-of-00031.safetensors",
|
| 724 |
+
"model.layers.62.self_attn.q_proj.bias": "model-00024-of-00031.safetensors",
|
| 725 |
+
"model.layers.62.self_attn.q_proj.weight": "model-00024-of-00031.safetensors",
|
| 726 |
+
"model.layers.62.self_attn.v_proj.bias": "model-00024-of-00031.safetensors",
|
| 727 |
+
"model.layers.62.self_attn.v_proj.weight": "model-00024-of-00031.safetensors",
|
| 728 |
+
"model.layers.63.input_layernorm.weight": "model-00024-of-00031.safetensors",
|
| 729 |
+
"model.layers.63.mlp.down_proj.weight": "model-00024-of-00031.safetensors",
|
| 730 |
+
"model.layers.63.mlp.gate_proj.weight": "model-00024-of-00031.safetensors",
|
| 731 |
+
"model.layers.63.mlp.up_proj.weight": "model-00024-of-00031.safetensors",
|
| 732 |
+
"model.layers.63.post_attention_layernorm.weight": "model-00024-of-00031.safetensors",
|
| 733 |
+
"model.layers.63.self_attn.k_proj.bias": "model-00024-of-00031.safetensors",
|
| 734 |
+
"model.layers.63.self_attn.k_proj.weight": "model-00024-of-00031.safetensors",
|
| 735 |
+
"model.layers.63.self_attn.o_proj.weight": "model-00024-of-00031.safetensors",
|
| 736 |
+
"model.layers.63.self_attn.q_proj.bias": "model-00024-of-00031.safetensors",
|
| 737 |
+
"model.layers.63.self_attn.q_proj.weight": "model-00024-of-00031.safetensors",
|
| 738 |
+
"model.layers.63.self_attn.v_proj.bias": "model-00024-of-00031.safetensors",
|
| 739 |
+
"model.layers.63.self_attn.v_proj.weight": "model-00024-of-00031.safetensors",
|
| 740 |
+
"model.layers.64.input_layernorm.weight": "model-00025-of-00031.safetensors",
|
| 741 |
+
"model.layers.64.mlp.down_proj.weight": "model-00025-of-00031.safetensors",
|
| 742 |
+
"model.layers.64.mlp.gate_proj.weight": "model-00024-of-00031.safetensors",
|
| 743 |
+
"model.layers.64.mlp.up_proj.weight": "model-00025-of-00031.safetensors",
|
| 744 |
+
"model.layers.64.post_attention_layernorm.weight": "model-00025-of-00031.safetensors",
|
| 745 |
+
"model.layers.64.self_attn.k_proj.bias": "model-00024-of-00031.safetensors",
|
| 746 |
+
"model.layers.64.self_attn.k_proj.weight": "model-00024-of-00031.safetensors",
|
| 747 |
+
"model.layers.64.self_attn.o_proj.weight": "model-00024-of-00031.safetensors",
|
| 748 |
+
"model.layers.64.self_attn.q_proj.bias": "model-00024-of-00031.safetensors",
|
| 749 |
+
"model.layers.64.self_attn.q_proj.weight": "model-00024-of-00031.safetensors",
|
| 750 |
+
"model.layers.64.self_attn.v_proj.bias": "model-00024-of-00031.safetensors",
|
| 751 |
+
"model.layers.64.self_attn.v_proj.weight": "model-00024-of-00031.safetensors",
|
| 752 |
+
"model.layers.65.input_layernorm.weight": "model-00025-of-00031.safetensors",
|
| 753 |
+
"model.layers.65.mlp.down_proj.weight": "model-00025-of-00031.safetensors",
|
| 754 |
+
"model.layers.65.mlp.gate_proj.weight": "model-00025-of-00031.safetensors",
|
| 755 |
+
"model.layers.65.mlp.up_proj.weight": "model-00025-of-00031.safetensors",
|
| 756 |
+
"model.layers.65.post_attention_layernorm.weight": "model-00025-of-00031.safetensors",
|
| 757 |
+
"model.layers.65.self_attn.k_proj.bias": "model-00025-of-00031.safetensors",
|
| 758 |
+
"model.layers.65.self_attn.k_proj.weight": "model-00025-of-00031.safetensors",
|
| 759 |
+
"model.layers.65.self_attn.o_proj.weight": "model-00025-of-00031.safetensors",
|
| 760 |
+
"model.layers.65.self_attn.q_proj.bias": "model-00025-of-00031.safetensors",
|
| 761 |
+
"model.layers.65.self_attn.q_proj.weight": "model-00025-of-00031.safetensors",
|
| 762 |
+
"model.layers.65.self_attn.v_proj.bias": "model-00025-of-00031.safetensors",
|
| 763 |
+
"model.layers.65.self_attn.v_proj.weight": "model-00025-of-00031.safetensors",
|
| 764 |
+
"model.layers.66.input_layernorm.weight": "model-00025-of-00031.safetensors",
|
| 765 |
+
"model.layers.66.mlp.down_proj.weight": "model-00025-of-00031.safetensors",
|
| 766 |
+
"model.layers.66.mlp.gate_proj.weight": "model-00025-of-00031.safetensors",
|
| 767 |
+
"model.layers.66.mlp.up_proj.weight": "model-00025-of-00031.safetensors",
|
| 768 |
+
"model.layers.66.post_attention_layernorm.weight": "model-00025-of-00031.safetensors",
|
| 769 |
+
"model.layers.66.self_attn.k_proj.bias": "model-00025-of-00031.safetensors",
|
| 770 |
+
"model.layers.66.self_attn.k_proj.weight": "model-00025-of-00031.safetensors",
|
| 771 |
+
"model.layers.66.self_attn.o_proj.weight": "model-00025-of-00031.safetensors",
|
| 772 |
+
"model.layers.66.self_attn.q_proj.bias": "model-00025-of-00031.safetensors",
|
| 773 |
+
"model.layers.66.self_attn.q_proj.weight": "model-00025-of-00031.safetensors",
|
| 774 |
+
"model.layers.66.self_attn.v_proj.bias": "model-00025-of-00031.safetensors",
|
| 775 |
+
"model.layers.66.self_attn.v_proj.weight": "model-00025-of-00031.safetensors",
|
| 776 |
+
"model.layers.67.input_layernorm.weight": "model-00026-of-00031.safetensors",
|
| 777 |
+
"model.layers.67.mlp.down_proj.weight": "model-00026-of-00031.safetensors",
|
| 778 |
+
"model.layers.67.mlp.gate_proj.weight": "model-00026-of-00031.safetensors",
|
| 779 |
+
"model.layers.67.mlp.up_proj.weight": "model-00026-of-00031.safetensors",
|
| 780 |
+
"model.layers.67.post_attention_layernorm.weight": "model-00026-of-00031.safetensors",
|
| 781 |
+
"model.layers.67.self_attn.k_proj.bias": "model-00025-of-00031.safetensors",
|
| 782 |
+
"model.layers.67.self_attn.k_proj.weight": "model-00025-of-00031.safetensors",
|
| 783 |
+
"model.layers.67.self_attn.o_proj.weight": "model-00025-of-00031.safetensors",
|
| 784 |
+
"model.layers.67.self_attn.q_proj.bias": "model-00025-of-00031.safetensors",
|
| 785 |
+
"model.layers.67.self_attn.q_proj.weight": "model-00025-of-00031.safetensors",
|
| 786 |
+
"model.layers.67.self_attn.v_proj.bias": "model-00025-of-00031.safetensors",
|
| 787 |
+
"model.layers.67.self_attn.v_proj.weight": "model-00025-of-00031.safetensors",
|
| 788 |
+
"model.layers.68.input_layernorm.weight": "model-00026-of-00031.safetensors",
|
| 789 |
+
"model.layers.68.mlp.down_proj.weight": "model-00026-of-00031.safetensors",
|
| 790 |
+
"model.layers.68.mlp.gate_proj.weight": "model-00026-of-00031.safetensors",
|
| 791 |
+
"model.layers.68.mlp.up_proj.weight": "model-00026-of-00031.safetensors",
|
| 792 |
+
"model.layers.68.post_attention_layernorm.weight": "model-00026-of-00031.safetensors",
|
| 793 |
+
"model.layers.68.self_attn.k_proj.bias": "model-00026-of-00031.safetensors",
|
| 794 |
+
"model.layers.68.self_attn.k_proj.weight": "model-00026-of-00031.safetensors",
|
| 795 |
+
"model.layers.68.self_attn.o_proj.weight": "model-00026-of-00031.safetensors",
|
| 796 |
+
"model.layers.68.self_attn.q_proj.bias": "model-00026-of-00031.safetensors",
|
| 797 |
+
"model.layers.68.self_attn.q_proj.weight": "model-00026-of-00031.safetensors",
|
| 798 |
+
"model.layers.68.self_attn.v_proj.bias": "model-00026-of-00031.safetensors",
|
| 799 |
+
"model.layers.68.self_attn.v_proj.weight": "model-00026-of-00031.safetensors",
|
| 800 |
+
"model.layers.69.input_layernorm.weight": "model-00026-of-00031.safetensors",
|
| 801 |
+
"model.layers.69.mlp.down_proj.weight": "model-00026-of-00031.safetensors",
|
| 802 |
+
"model.layers.69.mlp.gate_proj.weight": "model-00026-of-00031.safetensors",
|
| 803 |
+
"model.layers.69.mlp.up_proj.weight": "model-00026-of-00031.safetensors",
|
| 804 |
+
"model.layers.69.post_attention_layernorm.weight": "model-00026-of-00031.safetensors",
|
| 805 |
+
"model.layers.69.self_attn.k_proj.bias": "model-00026-of-00031.safetensors",
|
| 806 |
+
"model.layers.69.self_attn.k_proj.weight": "model-00026-of-00031.safetensors",
|
| 807 |
+
"model.layers.69.self_attn.o_proj.weight": "model-00026-of-00031.safetensors",
|
| 808 |
+
"model.layers.69.self_attn.q_proj.bias": "model-00026-of-00031.safetensors",
|
| 809 |
+
"model.layers.69.self_attn.q_proj.weight": "model-00026-of-00031.safetensors",
|
| 810 |
+
"model.layers.69.self_attn.v_proj.bias": "model-00026-of-00031.safetensors",
|
| 811 |
+
"model.layers.69.self_attn.v_proj.weight": "model-00026-of-00031.safetensors",
|
| 812 |
+
"model.layers.7.input_layernorm.weight": "model-00004-of-00031.safetensors",
|
| 813 |
+
"model.layers.7.mlp.down_proj.weight": "model-00004-of-00031.safetensors",
|
| 814 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00004-of-00031.safetensors",
|
| 815 |
+
"model.layers.7.mlp.up_proj.weight": "model-00004-of-00031.safetensors",
|
| 816 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00004-of-00031.safetensors",
|
| 817 |
+
"model.layers.7.self_attn.k_proj.bias": "model-00004-of-00031.safetensors",
|
| 818 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00004-of-00031.safetensors",
|
| 819 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00004-of-00031.safetensors",
|
| 820 |
+
"model.layers.7.self_attn.q_proj.bias": "model-00004-of-00031.safetensors",
|
| 821 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00004-of-00031.safetensors",
|
| 822 |
+
"model.layers.7.self_attn.v_proj.bias": "model-00004-of-00031.safetensors",
|
| 823 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00004-of-00031.safetensors",
|
| 824 |
+
"model.layers.70.input_layernorm.weight": "model-00027-of-00031.safetensors",
|
| 825 |
+
"model.layers.70.mlp.down_proj.weight": "model-00027-of-00031.safetensors",
|
| 826 |
+
"model.layers.70.mlp.gate_proj.weight": "model-00027-of-00031.safetensors",
|
| 827 |
+
"model.layers.70.mlp.up_proj.weight": "model-00027-of-00031.safetensors",
|
| 828 |
+
"model.layers.70.post_attention_layernorm.weight": "model-00027-of-00031.safetensors",
|
| 829 |
+
"model.layers.70.self_attn.k_proj.bias": "model-00027-of-00031.safetensors",
|
| 830 |
+
"model.layers.70.self_attn.k_proj.weight": "model-00027-of-00031.safetensors",
|
| 831 |
+
"model.layers.70.self_attn.o_proj.weight": "model-00027-of-00031.safetensors",
|
| 832 |
+
"model.layers.70.self_attn.q_proj.bias": "model-00027-of-00031.safetensors",
|
| 833 |
+
"model.layers.70.self_attn.q_proj.weight": "model-00027-of-00031.safetensors",
|
| 834 |
+
"model.layers.70.self_attn.v_proj.bias": "model-00027-of-00031.safetensors",
|
| 835 |
+
"model.layers.70.self_attn.v_proj.weight": "model-00027-of-00031.safetensors",
|
| 836 |
+
"model.layers.71.input_layernorm.weight": "model-00027-of-00031.safetensors",
|
| 837 |
+
"model.layers.71.mlp.down_proj.weight": "model-00027-of-00031.safetensors",
|
| 838 |
+
"model.layers.71.mlp.gate_proj.weight": "model-00027-of-00031.safetensors",
|
| 839 |
+
"model.layers.71.mlp.up_proj.weight": "model-00027-of-00031.safetensors",
|
| 840 |
+
"model.layers.71.post_attention_layernorm.weight": "model-00027-of-00031.safetensors",
|
| 841 |
+
"model.layers.71.self_attn.k_proj.bias": "model-00027-of-00031.safetensors",
|
| 842 |
+
"model.layers.71.self_attn.k_proj.weight": "model-00027-of-00031.safetensors",
|
| 843 |
+
"model.layers.71.self_attn.o_proj.weight": "model-00027-of-00031.safetensors",
|
| 844 |
+
"model.layers.71.self_attn.q_proj.bias": "model-00027-of-00031.safetensors",
|
| 845 |
+
"model.layers.71.self_attn.q_proj.weight": "model-00027-of-00031.safetensors",
|
| 846 |
+
"model.layers.71.self_attn.v_proj.bias": "model-00027-of-00031.safetensors",
|
| 847 |
+
"model.layers.71.self_attn.v_proj.weight": "model-00027-of-00031.safetensors",
|
| 848 |
+
"model.layers.72.input_layernorm.weight": "model-00028-of-00031.safetensors",
|
| 849 |
+
"model.layers.72.mlp.down_proj.weight": "model-00028-of-00031.safetensors",
|
| 850 |
+
"model.layers.72.mlp.gate_proj.weight": "model-00027-of-00031.safetensors",
|
| 851 |
+
"model.layers.72.mlp.up_proj.weight": "model-00027-of-00031.safetensors",
|
| 852 |
+
"model.layers.72.post_attention_layernorm.weight": "model-00028-of-00031.safetensors",
|
| 853 |
+
"model.layers.72.self_attn.k_proj.bias": "model-00027-of-00031.safetensors",
|
| 854 |
+
"model.layers.72.self_attn.k_proj.weight": "model-00027-of-00031.safetensors",
|
| 855 |
+
"model.layers.72.self_attn.o_proj.weight": "model-00027-of-00031.safetensors",
|
| 856 |
+
"model.layers.72.self_attn.q_proj.bias": "model-00027-of-00031.safetensors",
|
| 857 |
+
"model.layers.72.self_attn.q_proj.weight": "model-00027-of-00031.safetensors",
|
| 858 |
+
"model.layers.72.self_attn.v_proj.bias": "model-00027-of-00031.safetensors",
|
| 859 |
+
"model.layers.72.self_attn.v_proj.weight": "model-00027-of-00031.safetensors",
|
| 860 |
+
"model.layers.73.input_layernorm.weight": "model-00028-of-00031.safetensors",
|
| 861 |
+
"model.layers.73.mlp.down_proj.weight": "model-00028-of-00031.safetensors",
|
| 862 |
+
"model.layers.73.mlp.gate_proj.weight": "model-00028-of-00031.safetensors",
|
| 863 |
+
"model.layers.73.mlp.up_proj.weight": "model-00028-of-00031.safetensors",
|
| 864 |
+
"model.layers.73.post_attention_layernorm.weight": "model-00028-of-00031.safetensors",
|
| 865 |
+
"model.layers.73.self_attn.k_proj.bias": "model-00028-of-00031.safetensors",
|
| 866 |
+
"model.layers.73.self_attn.k_proj.weight": "model-00028-of-00031.safetensors",
|
| 867 |
+
"model.layers.73.self_attn.o_proj.weight": "model-00028-of-00031.safetensors",
|
| 868 |
+
"model.layers.73.self_attn.q_proj.bias": "model-00028-of-00031.safetensors",
|
| 869 |
+
"model.layers.73.self_attn.q_proj.weight": "model-00028-of-00031.safetensors",
|
| 870 |
+
"model.layers.73.self_attn.v_proj.bias": "model-00028-of-00031.safetensors",
|
| 871 |
+
"model.layers.73.self_attn.v_proj.weight": "model-00028-of-00031.safetensors",
|
| 872 |
+
"model.layers.74.input_layernorm.weight": "model-00028-of-00031.safetensors",
|
| 873 |
+
"model.layers.74.mlp.down_proj.weight": "model-00028-of-00031.safetensors",
|
| 874 |
+
"model.layers.74.mlp.gate_proj.weight": "model-00028-of-00031.safetensors",
|
| 875 |
+
"model.layers.74.mlp.up_proj.weight": "model-00028-of-00031.safetensors",
|
| 876 |
+
"model.layers.74.post_attention_layernorm.weight": "model-00028-of-00031.safetensors",
|
| 877 |
+
"model.layers.74.self_attn.k_proj.bias": "model-00028-of-00031.safetensors",
|
| 878 |
+
"model.layers.74.self_attn.k_proj.weight": "model-00028-of-00031.safetensors",
|
| 879 |
+
"model.layers.74.self_attn.o_proj.weight": "model-00028-of-00031.safetensors",
|
| 880 |
+
"model.layers.74.self_attn.q_proj.bias": "model-00028-of-00031.safetensors",
|
| 881 |
+
"model.layers.74.self_attn.q_proj.weight": "model-00028-of-00031.safetensors",
|
| 882 |
+
"model.layers.74.self_attn.v_proj.bias": "model-00028-of-00031.safetensors",
|
| 883 |
+
"model.layers.74.self_attn.v_proj.weight": "model-00028-of-00031.safetensors",
|
| 884 |
+
"model.layers.75.input_layernorm.weight": "model-00029-of-00031.safetensors",
|
| 885 |
+
"model.layers.75.mlp.down_proj.weight": "model-00029-of-00031.safetensors",
|
| 886 |
+
"model.layers.75.mlp.gate_proj.weight": "model-00028-of-00031.safetensors",
|
| 887 |
+
"model.layers.75.mlp.up_proj.weight": "model-00029-of-00031.safetensors",
|
| 888 |
+
"model.layers.75.post_attention_layernorm.weight": "model-00029-of-00031.safetensors",
|
| 889 |
+
"model.layers.75.self_attn.k_proj.bias": "model-00028-of-00031.safetensors",
|
| 890 |
+
"model.layers.75.self_attn.k_proj.weight": "model-00028-of-00031.safetensors",
|
| 891 |
+
"model.layers.75.self_attn.o_proj.weight": "model-00028-of-00031.safetensors",
|
| 892 |
+
"model.layers.75.self_attn.q_proj.bias": "model-00028-of-00031.safetensors",
|
| 893 |
+
"model.layers.75.self_attn.q_proj.weight": "model-00028-of-00031.safetensors",
|
| 894 |
+
"model.layers.75.self_attn.v_proj.bias": "model-00028-of-00031.safetensors",
|
| 895 |
+
"model.layers.75.self_attn.v_proj.weight": "model-00028-of-00031.safetensors",
|
| 896 |
+
"model.layers.76.input_layernorm.weight": "model-00029-of-00031.safetensors",
|
| 897 |
+
"model.layers.76.mlp.down_proj.weight": "model-00029-of-00031.safetensors",
|
| 898 |
+
"model.layers.76.mlp.gate_proj.weight": "model-00029-of-00031.safetensors",
|
| 899 |
+
"model.layers.76.mlp.up_proj.weight": "model-00029-of-00031.safetensors",
|
| 900 |
+
"model.layers.76.post_attention_layernorm.weight": "model-00029-of-00031.safetensors",
|
| 901 |
+
"model.layers.76.self_attn.k_proj.bias": "model-00029-of-00031.safetensors",
|
| 902 |
+
"model.layers.76.self_attn.k_proj.weight": "model-00029-of-00031.safetensors",
|
| 903 |
+
"model.layers.76.self_attn.o_proj.weight": "model-00029-of-00031.safetensors",
|
| 904 |
+
"model.layers.76.self_attn.q_proj.bias": "model-00029-of-00031.safetensors",
|
| 905 |
+
"model.layers.76.self_attn.q_proj.weight": "model-00029-of-00031.safetensors",
|
| 906 |
+
"model.layers.76.self_attn.v_proj.bias": "model-00029-of-00031.safetensors",
|
| 907 |
+
"model.layers.76.self_attn.v_proj.weight": "model-00029-of-00031.safetensors",
|
| 908 |
+
"model.layers.77.input_layernorm.weight": "model-00029-of-00031.safetensors",
|
| 909 |
+
"model.layers.77.mlp.down_proj.weight": "model-00029-of-00031.safetensors",
|
| 910 |
+
"model.layers.77.mlp.gate_proj.weight": "model-00029-of-00031.safetensors",
|
| 911 |
+
"model.layers.77.mlp.up_proj.weight": "model-00029-of-00031.safetensors",
|
| 912 |
+
"model.layers.77.post_attention_layernorm.weight": "model-00029-of-00031.safetensors",
|
| 913 |
+
"model.layers.77.self_attn.k_proj.bias": "model-00029-of-00031.safetensors",
|
| 914 |
+
"model.layers.77.self_attn.k_proj.weight": "model-00029-of-00031.safetensors",
|
| 915 |
+
"model.layers.77.self_attn.o_proj.weight": "model-00029-of-00031.safetensors",
|
| 916 |
+
"model.layers.77.self_attn.q_proj.bias": "model-00029-of-00031.safetensors",
|
| 917 |
+
"model.layers.77.self_attn.q_proj.weight": "model-00029-of-00031.safetensors",
|
| 918 |
+
"model.layers.77.self_attn.v_proj.bias": "model-00029-of-00031.safetensors",
|
| 919 |
+
"model.layers.77.self_attn.v_proj.weight": "model-00029-of-00031.safetensors",
|
| 920 |
+
"model.layers.78.input_layernorm.weight": "model-00030-of-00031.safetensors",
|
| 921 |
+
"model.layers.78.mlp.down_proj.weight": "model-00030-of-00031.safetensors",
|
| 922 |
+
"model.layers.78.mlp.gate_proj.weight": "model-00030-of-00031.safetensors",
|
| 923 |
+
"model.layers.78.mlp.up_proj.weight": "model-00030-of-00031.safetensors",
|
| 924 |
+
"model.layers.78.post_attention_layernorm.weight": "model-00030-of-00031.safetensors",
|
| 925 |
+
"model.layers.78.self_attn.k_proj.bias": "model-00029-of-00031.safetensors",
|
| 926 |
+
"model.layers.78.self_attn.k_proj.weight": "model-00029-of-00031.safetensors",
|
| 927 |
+
"model.layers.78.self_attn.o_proj.weight": "model-00029-of-00031.safetensors",
|
| 928 |
+
"model.layers.78.self_attn.q_proj.bias": "model-00029-of-00031.safetensors",
|
| 929 |
+
"model.layers.78.self_attn.q_proj.weight": "model-00029-of-00031.safetensors",
|
| 930 |
+
"model.layers.78.self_attn.v_proj.bias": "model-00029-of-00031.safetensors",
|
| 931 |
+
"model.layers.78.self_attn.v_proj.weight": "model-00029-of-00031.safetensors",
|
| 932 |
+
"model.layers.79.input_layernorm.weight": "model-00030-of-00031.safetensors",
|
| 933 |
+
"model.layers.79.mlp.down_proj.weight": "model-00030-of-00031.safetensors",
|
| 934 |
+
"model.layers.79.mlp.gate_proj.weight": "model-00030-of-00031.safetensors",
|
| 935 |
+
"model.layers.79.mlp.up_proj.weight": "model-00030-of-00031.safetensors",
|
| 936 |
+
"model.layers.79.post_attention_layernorm.weight": "model-00030-of-00031.safetensors",
|
| 937 |
+
"model.layers.79.self_attn.k_proj.bias": "model-00030-of-00031.safetensors",
|
| 938 |
+
"model.layers.79.self_attn.k_proj.weight": "model-00030-of-00031.safetensors",
|
| 939 |
+
"model.layers.79.self_attn.o_proj.weight": "model-00030-of-00031.safetensors",
|
| 940 |
+
"model.layers.79.self_attn.q_proj.bias": "model-00030-of-00031.safetensors",
|
| 941 |
+
"model.layers.79.self_attn.q_proj.weight": "model-00030-of-00031.safetensors",
|
| 942 |
+
"model.layers.79.self_attn.v_proj.bias": "model-00030-of-00031.safetensors",
|
| 943 |
+
"model.layers.79.self_attn.v_proj.weight": "model-00030-of-00031.safetensors",
|
| 944 |
+
"model.layers.8.input_layernorm.weight": "model-00004-of-00031.safetensors",
|
| 945 |
+
"model.layers.8.mlp.down_proj.weight": "model-00004-of-00031.safetensors",
|
| 946 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00004-of-00031.safetensors",
|
| 947 |
+
"model.layers.8.mlp.up_proj.weight": "model-00004-of-00031.safetensors",
|
| 948 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00004-of-00031.safetensors",
|
| 949 |
+
"model.layers.8.self_attn.k_proj.bias": "model-00004-of-00031.safetensors",
|
| 950 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00004-of-00031.safetensors",
|
| 951 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00004-of-00031.safetensors",
|
| 952 |
+
"model.layers.8.self_attn.q_proj.bias": "model-00004-of-00031.safetensors",
|
| 953 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00004-of-00031.safetensors",
|
| 954 |
+
"model.layers.8.self_attn.v_proj.bias": "model-00004-of-00031.safetensors",
|
| 955 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00004-of-00031.safetensors",
|
| 956 |
+
"model.layers.9.input_layernorm.weight": "model-00005-of-00031.safetensors",
|
| 957 |
+
"model.layers.9.mlp.down_proj.weight": "model-00005-of-00031.safetensors",
|
| 958 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00004-of-00031.safetensors",
|
| 959 |
+
"model.layers.9.mlp.up_proj.weight": "model-00005-of-00031.safetensors",
|
| 960 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00005-of-00031.safetensors",
|
| 961 |
+
"model.layers.9.self_attn.k_proj.bias": "model-00004-of-00031.safetensors",
|
| 962 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00004-of-00031.safetensors",
|
| 963 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00004-of-00031.safetensors",
|
| 964 |
+
"model.layers.9.self_attn.q_proj.bias": "model-00004-of-00031.safetensors",
|
| 965 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00004-of-00031.safetensors",
|
| 966 |
+
"model.layers.9.self_attn.v_proj.bias": "model-00004-of-00031.safetensors",
|
| 967 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00004-of-00031.safetensors",
|
| 968 |
+
"model.norm.weight": "model-00030-of-00031.safetensors"
|
| 969 |
+
}
|
| 970 |
+
}
|
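Each weight_map entry above names the shard file that physically stores that parameter, which is what lets a loader open only the shards it needs rather than all 31 files. As a minimal sketch of how such an index is consumed (assuming the shards and model.safetensors.index.json sit together in a local directory; the directory path and the load_tensor helper are illustrative, not part of this repo):

import json
import os

from safetensors import safe_open  # requires the `safetensors` package

ckpt_dir = "20250121165312/hf-593"  # hypothetical local copy of this snapshot

with open(os.path.join(ckpt_dir, "model.safetensors.index.json")) as f:
    index = json.load(f)

weight_map = index["weight_map"]  # parameter name -> shard filename

def load_tensor(name):
    # Open only the shard that holds `name` and read that single tensor.
    shard = weight_map[name]
    with safe_open(os.path.join(ckpt_dir, shard), framework="pt") as f:
        return f.get_tensor(name)

w = load_tensor("model.layers.58.mlp.gate_proj.weight")  # stored in shard 00022
print(w.shape)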
20250121165312/hf-593/special_tokens_map.json
ADDED
@@ -0,0 +1,31 @@
+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
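This file declares which strings the tokenizer must treat as atomic control tokens: <|im_end|> (the ChatML end-of-turn marker) serves as the eos_token, and <|endoftext|> is reused as the pad_token. A minimal sketch of how these fields surface once the snapshot is loaded with transformers (the local path is illustrative):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("20250121165312/hf-593")  # hypothetical local path

print(tok.eos_token)                  # '<|im_end|>'
print(tok.pad_token)                  # '<|endoftext|>'
print(tok.additional_special_tokens)  # ['<|im_start|>', '<|im_end|>', ...]

# Special tokens are kept atomic rather than being split by the BPE merges:
ids = tok("<|im_start|>user\nhi<|im_end|>").input_ids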
20250121165312/hf-593/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff

20250121165312/rank0.log
ADDED
The diff for this file is too large to render.
See raw diff

20250121165312/rank10.log
ADDED
The diff for this file is too large to render.
See raw diff

20250121165312/rank13.log
ADDED
The diff for this file is too large to render.
See raw diff

20250121165312/rank14.log
ADDED
The diff for this file is too large to render.
See raw diff

20250121165312/rank18.log
ADDED
The diff for this file is too large to render.
See raw diff

20250121165312/rank21.log
ADDED
The diff for this file is too large to render.
See raw diff