Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See raw diff for the full change set.
- README.md +162 -0
- adapter_config.json +34 -0
- adapter_model.bin +3 -0
- added_tokens.json +5 -0
- checkpoint-1130/README.md +202 -0
- checkpoint-1130/adapter_config.json +34 -0
- checkpoint-1130/adapter_model.safetensors +3 -0
- checkpoint-1130/added_tokens.json +5 -0
- checkpoint-1130/merges.txt +0 -0
- checkpoint-1130/optimizer.pt +3 -0
- checkpoint-1130/rng_state_0.pth +3 -0
- checkpoint-1130/rng_state_1.pth +3 -0
- checkpoint-1130/scheduler.pt +3 -0
- checkpoint-1130/special_tokens_map.json +20 -0
- checkpoint-1130/tokenizer.json +0 -0
- checkpoint-1130/tokenizer_config.json +43 -0
- checkpoint-1130/trainer_state.json +0 -0
- checkpoint-1130/training_args.bin +3 -0
- checkpoint-1130/vocab.json +0 -0
- checkpoint-1695/README.md +202 -0
- checkpoint-1695/adapter_config.json +34 -0
- checkpoint-1695/adapter_model.safetensors +3 -0
- checkpoint-1695/added_tokens.json +5 -0
- checkpoint-1695/merges.txt +0 -0
- checkpoint-1695/optimizer.pt +3 -0
- checkpoint-1695/rng_state_0.pth +3 -0
- checkpoint-1695/rng_state_1.pth +3 -0
- checkpoint-1695/scheduler.pt +3 -0
- checkpoint-1695/special_tokens_map.json +20 -0
- checkpoint-1695/tokenizer.json +0 -0
- checkpoint-1695/tokenizer_config.json +43 -0
- checkpoint-1695/trainer_state.json +0 -0
- checkpoint-1695/training_args.bin +3 -0
- checkpoint-1695/vocab.json +0 -0
- checkpoint-565/README.md +202 -0
- checkpoint-565/adapter_config.json +34 -0
- checkpoint-565/adapter_model.safetensors +3 -0
- checkpoint-565/added_tokens.json +5 -0
- checkpoint-565/merges.txt +0 -0
- checkpoint-565/optimizer.pt +3 -0
- checkpoint-565/rng_state_0.pth +3 -0
- checkpoint-565/rng_state_1.pth +3 -0
- checkpoint-565/scheduler.pt +3 -0
- checkpoint-565/special_tokens_map.json +20 -0
- checkpoint-565/tokenizer.json +0 -0
- checkpoint-565/tokenizer_config.json +43 -0
- checkpoint-565/trainer_state.json +4008 -0
- checkpoint-565/training_args.bin +3 -0
- checkpoint-565/vocab.json +0 -0
- config.json +42 -0

README.md
ADDED
@@ -0,0 +1,162 @@
+---
+license: other
+library_name: peft
+tags:
+- generated_from_trainer
+base_model: Qwen/Qwen1.5-7B
+model-index:
+- name: home/yujia/home/CN_Hateful/trained_models/qwen/CN/toxi/1e-3/
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+<details><summary>See axolotl config</summary>
+
+axolotl version: `0.4.0`
+```yaml
+# base_model: Qwen/Qwen-7B
+base_model: Qwen/Qwen1.5-7B
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+
+trust_remote_code: true
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+datasets:
+  # - path: mhenrichsen/alpaca_2k_test
+  - path: /home/yujia/home/CN_Hateful/train_toxiCN_cn.json
+  # - path: /home/yujia/home/CN_Hateful/train_toxiCN.json
+  # - path: /home/yujia/home/CN_Hateful/train.json
+  # - path: /home/yujia/home/CN_Hateful/train_cn.json
+    ds_type: json
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0.05
+output_dir: /home/yujia/home/CN_Hateful/trained_models/qwen/CN/toxi/1e-3/
+# output_dir: /home/yujia/home/CN_Hateful/trained_models/qwen/toxi/1e-5/
+# output_dir: /home/yujia/home/CN_Hateful/trained_models/qwen/cold/3e-4/
+# output_dir: /home/yujia/home/CN_Hateful/trained_models/qwen/CN/cold/3e-4/
+
+sequence_len: 256 # supports up to 8192
+sample_packing: false
+pad_to_sequence_len:
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.001
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: false
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention:
+
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 20
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+
+```
+
+</details><br>
+
+# home/yujia/home/CN_Hateful/trained_models/qwen/CN/toxi/1e-3/
+
+This model is a fine-tuned version of [Qwen/Qwen1.5-7B](https://huggingface.co/Qwen/Qwen1.5-7B) on the None dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.1333
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 0.001
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 2
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 16
+- total_eval_batch_size: 4
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 10
+- num_epochs: 3
+
+### Training results
+
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 3.3182        | 0.0   | 1    | 3.3363          |
+| 0.1432        | 0.25  | 142  | 1.5367          |
+| 0.1667        | 0.5   | 284  | 0.1418          |
+| 0.1329        | 0.75  | 426  | 0.1375          |
+| 0.1372        | 1.0   | 568  | 0.1397          |
+| 0.1369        | 1.26  | 710  | 0.1382          |
+| 0.1537        | 1.51  | 852  | 0.1408          |
+| 0.1204        | 1.76  | 994  | 0.1354          |
+| 0.1262        | 2.01  | 1136 | 0.1343          |
+| 0.12          | 2.26  | 1278 | 0.1380          |
+| 0.146         | 2.51  | 1420 | 0.1335          |
+| 0.1502        | 2.76  | 1562 | 0.1333          |
+
+
+### Framework versions
+
+- PEFT 0.10.0
+- Transformers 4.40.0.dev0
+- Pytorch 2.2.1+cu121
+- Datasets 2.18.0
+- Tokenizers 0.15.0

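Taken together, the card and config describe a LoRA adapter (r=32, alpha=16, all linear projections) trained on top of Qwen/Qwen1.5-7B; the card's own "how to get started" section is still empty. Below is a minimal, hedged sketch of loading such an adapter for inference with PEFT and Transformers. The adapter directory path is an assumption (this repo's root, or the config's `output_dir`), and the prompt and generation settings are illustrative only, not this repo's documented usage.

```python
# Minimal sketch: load the Qwen1.5-7B base model and apply this LoRA adapter.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

BASE_ID = "Qwen/Qwen1.5-7B"
ADAPTER_DIR = "."  # assumption: directory holding adapter_config.json + weights

tokenizer = AutoTokenizer.from_pretrained(BASE_ID)
base = AutoModelForCausalLM.from_pretrained(
    BASE_ID, torch_dtype=torch.bfloat16, device_map="auto"
)
model = PeftModel.from_pretrained(base, ADAPTER_DIR)  # merges-in the LoRA deltas at runtime
model.eval()

# The config trains on alpaca-format data, so an instruction-style prompt fits.
prompt = "下面的评论是否包含仇恨言论？"  # hypothetical instruction
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=20)  # eval_max_new_tokens above
print(tokenizer.decode(out[0], skip_special_tokens=True))
```
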
adapter_config.json
ADDED
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen1.5-7B",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "up_proj",
+    "q_proj",
+    "down_proj",
+    "gate_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

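This config pins down the adapter's size: r = 32 low-rank factors on all seven linear projections of every decoder layer. A back-of-the-envelope sketch of the implied parameter count follows; the layer shapes (hidden size 4096, MLP size 11008, 32 layers) are assumptions about Qwen1.5-7B's architecture, not values read from this repo.

```python
# Rough LoRA parameter count implied by adapter_config.json.
# Assumed Qwen1.5-7B shapes: hidden=4096, intermediate=11008, 32 layers.
r = 32
hidden, inter, n_layers = 4096, 11008, 32

# A LoRA pair on a (d_in -> d_out) linear adds r * (d_in + d_out) weights.
shapes = {
    "q_proj": (hidden, hidden), "k_proj": (hidden, hidden),
    "v_proj": (hidden, hidden), "o_proj": (hidden, hidden),
    "gate_proj": (hidden, inter), "up_proj": (hidden, inter),
    "down_proj": (inter, hidden),
}
per_layer = sum(r * (d_in + d_out) for d_in, d_out in shapes.values())
total = per_layer * n_layers
print(f"{total:,} trainable parameters")     # 79,953,920
print(f"~{total * 4 / 1e6:.0f} MB in fp32")  # ~320 MB, in line with the ~319.9 MB weight files below
```
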
adapter_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:830e38ae3e35fd023a10add9d0ac7345292caf24c2129e5d06657eaea7b92d92
+size 319977674

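The three lines above are a Git LFS pointer, not the weights themselves: the hub stores the ~320 MB binary separately and resolves it at download time. A sketch of fetching the resolved file with huggingface_hub; the repo id is a placeholder, since the actual one is not shown in this view.

```python
# Sketch: resolve the LFS pointer to the real weight file.
# "user/repo" is hypothetical; substitute this model's actual repo id.
from huggingface_hub import hf_hub_download

local_path = hf_hub_download(repo_id="user/repo", filename="adapter_model.bin")
print(local_path)  # cached path to the full 319,977,674-byte file
```
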
added_tokens.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

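These three entries register Qwen's ChatML control tokens at fixed vocabulary ids. A quick sanity check, assuming the tokenizer is loaded from this repo's files (the local path is an assumption):

```python
# Sketch: confirm the special-token ids declared in added_tokens.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # assumed: this repo's root directory
for t in ("<|endoftext|>", "<|im_start|>", "<|im_end|>"):
    print(t, tok.convert_tokens_to_ids(t))
# <|endoftext|> 151643
# <|im_start|> 151644
# <|im_end|> 151645
```
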
checkpoint-1130/README.md
ADDED
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: Qwen/Qwen1.5-7B
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.10.0

checkpoint-1130/adapter_config.json
ADDED
Content identical to adapter_config.json above (34 lines).

checkpoint-1130/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75b2f36c11368d62522d332b027598c1bd25cdc0f0e76c61e91168daaa95eaf8
+size 319876032

checkpoint-1130/added_tokens.json
ADDED
Content identical to added_tokens.json above (5 lines).

checkpoint-1130/merges.txt
ADDED
The diff for this file is too large to render. See raw diff.

checkpoint-1130/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:096c2ec3c1b66b84963de8dc8e04c3153d4303225cdae6d477a7024ec04d471e
+size 160736532

checkpoint-1130/rng_state_0.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d4901585e9d75e84023ab72e4541020015ec7f9e3a44dd30228bed49938a1bc
+size 14512

checkpoint-1130/rng_state_1.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f52b8bbcff4bb55ccbed97b61cc7bef4a35d002ff92406d2e23baa476f0a8d21
+size 14512

checkpoint-1130/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cff9ea3978dc9317c468c2c00769c2a61de1c01c1a4ff299627005ac129aa7f6
+size 1064

checkpoint-1130/special_tokens_map.json
ADDED
@@ -0,0 +1,20 @@
+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-1130/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.

checkpoint-1130/tokenizer_config.json
ADDED
@@ -0,0 +1,43 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

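The `chat_template` field above is a Jinja template implementing the ChatML format: it prepends a default system turn when the conversation does not start with one, wraps each message in `<|im_start|>`/`<|im_end|>`, and optionally opens an assistant turn. A short sketch of how it renders; loading from the checkpoint directory is an assumption, and any copy of this tokenizer_config.json behaves the same way.

```python
# Sketch: render the ChatML chat_template defined in tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-1130")  # assumed local path
messages = [{"role": "user", "content": "你好"}]
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
# <|im_start|>system
# You are a helpful assistant<|im_end|>
# <|im_start|>user
# 你好<|im_end|>
# <|im_start|>assistant
```
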
checkpoint-1130/trainer_state.json
ADDED
The diff for this file is too large to render. See raw diff.

checkpoint-1130/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:242a9177bc415e2f72dc78b8f2eb1cd29a0e78da733db42237e6bb4cd1af9c7d
+size 5752

checkpoint-1130/vocab.json
ADDED
The diff for this file is too large to render. See raw diff.

checkpoint-1695/README.md
ADDED
Content identical to checkpoint-1130/README.md above (202 lines; the default auto-generated PEFT model card).

checkpoint-1695/adapter_config.json
ADDED
Content identical to adapter_config.json above (34 lines).

checkpoint-1695/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d39246b5530cbaafea081381c21c915516e701ad53556b068ed5cb35cec7f477
+size 319876032

checkpoint-1695/added_tokens.json
ADDED
Content identical to added_tokens.json above (5 lines).

checkpoint-1695/merges.txt
ADDED
The diff for this file is too large to render. See raw diff.

checkpoint-1695/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0049358da0b017afaec8df1d9c4c584032cc7c203245a00b042769b0c83a1c39
+size 160736532

checkpoint-1695/rng_state_0.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b55ad0de153a9394c92810ea3c27399952a305bd25451ed430aaf933a9a5e55c
+size 14512

checkpoint-1695/rng_state_1.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49270efca7489b54d9139d7c93770e62001cb2f4e09707c29e75b02ffb96afb7
+size 14512

checkpoint-1695/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7323ec8a0c271f5ede6b76f43aedf644821eee5e8de58b5de06dfe7df966802d
+size 1064

checkpoint-1695/special_tokens_map.json
ADDED
Content identical to checkpoint-1130/special_tokens_map.json above (20 lines).

checkpoint-1695/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.

checkpoint-1695/tokenizer_config.json
ADDED
Content identical to checkpoint-1130/tokenizer_config.json above (43 lines).

checkpoint-1695/trainer_state.json
ADDED
The diff for this file is too large to render. See raw diff.

checkpoint-1695/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:242a9177bc415e2f72dc78b8f2eb1cd29a0e78da733db42237e6bb4cd1af9c7d
+size 5752

checkpoint-1695/vocab.json
ADDED
The diff for this file is too large to render. See raw diff.

checkpoint-565/README.md
ADDED
Content identical to checkpoint-1130/README.md above (202 lines; the default auto-generated PEFT model card).

checkpoint-565/adapter_config.json
ADDED
Content identical to adapter_config.json above (34 lines).

checkpoint-565/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25b0612ae6de3c34b90c5a1d1ceaaffbce4812950fb53caa68724381b1f03748
+size 319876032

checkpoint-565/added_tokens.json
ADDED
Content identical to added_tokens.json above (5 lines).

checkpoint-565/merges.txt
ADDED
The diff for this file is too large to render. See raw diff.

checkpoint-565/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:853f09c69b65007336bb2ab3c1759f1b22107ff09f84bcfc69cce2df88e5195a
+size 160736532

checkpoint-565/rng_state_0.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49a681c33ffd6b61feaf98d05f702d37f2e4cea5ae28ff9fe027ab78959f6d28
+size 14512

checkpoint-565/rng_state_1.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:264fc0be051e80523f0b0faf9f50191b6a6d8a450a601ac7a6354029ee14de9c
+size 14512

checkpoint-565/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e16654130fe41ba22580528fce61920c5dca76f31f760c72e1f44964e37bd87c
+size 1064

checkpoint-565/special_tokens_map.json
ADDED
Content identical to checkpoint-1130/special_tokens_map.json above (20 lines).

checkpoint-565/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.

checkpoint-565/tokenizer_config.json
ADDED
Content identical to checkpoint-1130/tokenizer_config.json above (43 lines).

checkpoint-565/trainer_state.json
ADDED
|
@@ -0,0 +1,4008 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9995577178239717,
  "eval_steps": 142,
  "global_step": 565,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0, "grad_norm": 4.705155372619629, "learning_rate": 0.0001, "loss": 3.3182, "step": 1 },
    { "epoch": 0.0, "eval_loss": 3.3362529277801514, "eval_runtime": 14.4366, "eval_samples_per_second": 33.041, "eval_steps_per_second": 8.312, "step": 1 },
    { "epoch": 0.0, "grad_norm": 4.644563674926758, "learning_rate": 0.0002, "loss": 3.2788, "step": 2 },
    { "epoch": 0.01, "grad_norm": 4.3825764656066895, "learning_rate": 0.0003, "loss": 2.9231, "step": 3 },
    { "epoch": 0.01, "grad_norm": 2.904296636581421, "learning_rate": 0.0004, "loss": 1.1824, "step": 4 },
    { "epoch": 0.01, "grad_norm": 1.6988284587860107, "learning_rate": 0.0005, "loss": 0.3472, "step": 5 },
    { "epoch": 0.01, "grad_norm": 0.5742101073265076, "learning_rate": 0.0006, "loss": 0.1478, "step": 6 },
    { "epoch": 0.01, "grad_norm": 0.6511944532394409, "learning_rate": 0.0007, "loss": 0.1532, "step": 7 },
    { "epoch": 0.01, "grad_norm": 2.305083751678467, "learning_rate": 0.0008, "loss": 0.2397, "step": 8 },
    { "epoch": 0.02, "grad_norm": 4.7435078620910645, "learning_rate": 0.0009000000000000001, "loss": 0.434, "step": 9 },
    { "epoch": 0.02, "grad_norm": 0.6199779510498047, "learning_rate": 0.001, "loss": 0.1743, "step": 10 },
    { "epoch": 0.02, "grad_norm": 0.14406554400920868, "learning_rate": 0.0009999991309598973, "loss": 0.1404, "step": 11 },
    { "epoch": 0.02, "grad_norm": 0.1965201050043106, "learning_rate": 0.0009999965238426103, "loss": 0.1418, "step": 12 },
    { "epoch": 0.02, "grad_norm": 26.520109176635742, "learning_rate": 0.0009999921786572016, "loss": 0.2689, "step": 13 },
    { "epoch": 0.02, "grad_norm": 0.0870603695511818, "learning_rate": 0.0009999860954187755, "loss": 0.1338, "step": 14 },
    { "epoch": 0.03, "grad_norm": 0.05200817808508873, "learning_rate": 0.0009999782741484788, "loss": 0.1308, "step": 15 },
    { "epoch": 0.03, "grad_norm": 0.2145700752735138, "learning_rate": 0.0009999687148734995, "loss": 0.1375, "step": 16 },
    { "epoch": 0.03, "grad_norm": 0.19921083748340607, "learning_rate": 0.0009999574176270667, "loss": 0.1388, "step": 17 },
    { "epoch": 0.03, "grad_norm": 1.193419337272644, "learning_rate": 0.0009999443824484518, "loss": 0.1978, "step": 18 },
    { "epoch": 0.03, "grad_norm": 0.4399484395980835, "learning_rate": 0.0009999296093829671, "loss": 0.1518, "step": 19 },
    { "epoch": 0.04, "grad_norm": 44.88853073120117, "learning_rate": 0.0009999130984819661, "loss": 0.9033, "step": 20 },
    { "epoch": 0.04, "grad_norm": 0.3220385015010834, "learning_rate": 0.0009998948498028434, "loss": 0.1234, "step": 21 },
    { "epoch": 0.04, "grad_norm": 0.5420748591423035, "learning_rate": 0.0009998748634090344, "loss": 0.1602, "step": 22 },
    { "epoch": 0.04, "grad_norm": 0.5249865651130676, "learning_rate": 0.0009998531393700149, "loss": 0.1538, "step": 23 },
    { "epoch": 0.04, "grad_norm": 0.056158341467380524, "learning_rate": 0.000999829677761301, "loss": 0.1374, "step": 24 },
    { "epoch": 0.04, "grad_norm": 0.19818872213363647, "learning_rate": 0.0009998044786644492, "loss": 0.1413, "step": 25 },
    { "epoch": 0.05, "grad_norm": 0.27901849150657654, "learning_rate": 0.0009997775421670557, "loss": 0.1395, "step": 26 },
    { "epoch": 0.05, "grad_norm": 0.22768354415893555, "learning_rate": 0.0009997488683627558, "loss": 0.1241, "step": 27 },
    { "epoch": 0.05, "grad_norm": 0.14878959953784943, "learning_rate": 0.0009997184573512245, "loss": 0.1243, "step": 28 },
    { "epoch": 0.05, "grad_norm": 1.0589066743850708, "learning_rate": 0.000999686309238175, "loss": 0.2499, "step": 29 },
    { "epoch": 0.05, "grad_norm": 0.11455405503511429, "learning_rate": 0.00099965242413536, "loss": 0.1254, "step": 30 },
    { "epoch": 0.05, "grad_norm": 0.16566088795661926, "learning_rate": 0.000999616802160569, "loss": 0.1416, "step": 31 },
    { "epoch": 0.06, "grad_norm": 1.3691716194152832, "learning_rate": 0.0009995794434376297, "loss": 0.1465, "step": 32 },
    { "epoch": 0.06, "grad_norm": 0.09674070030450821, "learning_rate": 0.000999540348096407, "loss": 0.1373, "step": 33 },
    { "epoch": 0.06, "grad_norm": 0.5034632086753845, "learning_rate": 0.000999499516272803, "loss": 0.1471, "step": 34 },
    { "epoch": 0.06, "grad_norm": 0.26572930812835693, "learning_rate": 0.0009994569481087553, "loss": 0.1424, "step": 35 },
    { "epoch": 0.06, "grad_norm": 0.20631802082061768, "learning_rate": 0.0009994126437522376, "loss": 0.1449, "step": 36 },
    { "epoch": 0.07, "grad_norm": 0.11268749833106995, "learning_rate": 0.0009993666033572591, "loss": 0.1403, "step": 37 },
    { "epoch": 0.07, "grad_norm": 0.6610996723175049, "learning_rate": 0.0009993188270838635, "loss": 0.1424, "step": 38 },
    { "epoch": 0.07, "grad_norm": 98.93838500976562, "learning_rate": 0.0009992693150981291, "loss": 2.775, "step": 39 },
    { "epoch": 0.07, "grad_norm": 32.53168869018555, "learning_rate": 0.0009992180675721671, "loss": 0.6932, "step": 40 },
    { "epoch": 0.07, "grad_norm": 54.8778076171875, "learning_rate": 0.0009991650846841226, "loss": 5.7008, "step": 41 },
    { "epoch": 0.07, "grad_norm": 2.0524775981903076, "learning_rate": 0.000999110366618172, "loss": 0.1623, "step": 42 },
    { "epoch": 0.08, "grad_norm": 0.404278427362442, "learning_rate": 0.0009990539135645246, "loss": 0.1427, "step": 43 },
    { "epoch": 0.08, "grad_norm": 1.7963409423828125, "learning_rate": 0.0009989957257194198, "loss": 0.174, "step": 44 },
    { "epoch": 0.08, "grad_norm": 0.11620022356510162, "learning_rate": 0.0009989358032851284, "loss": 0.1339, "step": 45 },
    { "epoch": 0.08, "grad_norm": 0.5025681853294373, "learning_rate": 0.00099887414646995, "loss": 0.1558, "step": 46 },
    { "epoch": 0.08, "grad_norm": 78.1165771484375, "learning_rate": 0.0009988107554882138, "loss": 2.2938, "step": 47 },
    { "epoch": 0.08, "grad_norm": 0.08389786630868912, "learning_rate": 0.0009987456305602768, "loss": 0.1409, "step": 48 },
    { "epoch": 0.09, "grad_norm": 7.123101711273193, "learning_rate": 0.0009986787719125242, "loss": 0.1524, "step": 49 },
    { "epoch": 0.09, "grad_norm": 0.5341290235519409, "learning_rate": 0.0009986101797773666, "loss": 0.1598, "step": 50 },
    { "epoch": 0.09, "grad_norm": 0.05239284038543701, "learning_rate": 0.000998539854393242, "loss": 0.1386, "step": 51 },
    { "epoch": 0.09, "grad_norm": 0.0722254291176796, "learning_rate": 0.0009984677960046123, "loss": 0.1385, "step": 52 },
    { "epoch": 0.09, "grad_norm": 0.11535236239433289, "learning_rate": 0.000998394004861964, "loss": 0.1369, "step": 53 },
    { "epoch": 0.1, "grad_norm": 0.7584894299507141, "learning_rate": 0.0009983184812218072, "loss": 0.108, "step": 54 },
    { "epoch": 0.1, "grad_norm": 0.8361538052558899, "learning_rate": 0.000998241225346674, "loss": 0.1703, "step": 55 },
    { "epoch": 0.1, "grad_norm": 0.37683162093162537, "learning_rate": 0.0009981622375051184, "loss": 0.1368, "step": 56 },
    { "epoch": 0.1, "grad_norm": 0.5335961580276489, "learning_rate": 0.0009980815179717144, "loss": 0.1559, "step": 57 },
    { "epoch": 0.1, "grad_norm": 0.2806299328804016, "learning_rate": 0.0009979990670270565, "loss": 0.1397, "step": 58 },
    { "epoch": 0.1, "grad_norm": 0.4967437982559204, "learning_rate": 0.0009979148849577574, "loss": 0.1543, "step": 59 },
    { "epoch": 0.11, "grad_norm": 0.10350017994642258, "learning_rate": 0.0009978289720564471, "loss": 0.1367, "step": 60 },
    { "epoch": 0.11, "grad_norm": 78.35698699951172, "learning_rate": 0.0009977413286217727, "loss": 2.2474, "step": 61 },
    { "epoch": 0.11, "grad_norm": 0.15264186263084412, "learning_rate": 0.0009976519549583973, "loss": 0.1311, "step": 62 },
    { "epoch": 0.11, "grad_norm": 0.31865784525871277, "learning_rate": 0.0009975608513769975, "loss": 0.1407, "step": 63 },
    { "epoch": 0.11, "grad_norm": 0.32891547679901123, "learning_rate": 0.0009974680181942645, "loss": 0.1423, "step": 64 },
    { "epoch": 0.11, "grad_norm": 0.15653717517852783, "learning_rate": 0.0009973734557329008, "loss": 0.1365, "step": 65 },
    { "epoch": 0.12, "grad_norm": 0.3237778842449188, "learning_rate": 0.0009972771643216212, "loss": 0.1407, "step": 66 },
    { "epoch": 0.12, "grad_norm": 0.13634416460990906, "learning_rate": 0.0009971791442951496, "loss": 0.1378, "step": 67 },
    { "epoch": 0.12, "grad_norm": 0.3488883376121521, "learning_rate": 0.0009970793959942197, "loss": 0.1429, "step": 68 },
    { "epoch": 0.12, "grad_norm": 0.5150622129440308, "learning_rate": 0.0009969779197655725, "loss": 0.1492, "step": 69 },
    { "epoch": 0.12, "grad_norm": 0.3482552468776703, "learning_rate": 0.0009968747159619555, "loss": 0.1415, "step": 70 },
    { "epoch": 0.13, "grad_norm": 0.22551549971103668, "learning_rate": 0.000996769784942122, "loss": 0.1418, "step": 71 },
    { "epoch": 0.13, "grad_norm": 0.20759086310863495, "learning_rate": 0.0009966631270708287, "loss": 0.1366, "step": 72 },
    { "epoch": 0.13, "grad_norm": 13.050313949584961, "learning_rate": 0.0009965547427188356, "loss": 0.1375, "step": 73 },
    { "epoch": 0.13, "grad_norm": 0.18372055888175964, "learning_rate": 0.0009964446322629043, "loss": 0.1285, "step": 74 },
    { "epoch": 0.13, "grad_norm": 0.4404817819595337, "learning_rate": 0.000996332796085796, "loss": 0.1501, "step": 75 },
    { "epoch": 0.13, "grad_norm": 1.269240379333496, "learning_rate": 0.0009962192345762716, "loss": 0.1346, "step": 76 },
    { "epoch": 0.14, "grad_norm": 32.20164108276367, "learning_rate": 0.0009961039481290888, "loss": 0.3348, "step": 77 },
    { "epoch": 0.14, "grad_norm": 82.92976379394531, "learning_rate": 0.0009959869371450021, "loss": 5.8309, "step": 78 },
    { "epoch": 0.14, "grad_norm": 0.3416314721107483, "learning_rate": 0.0009958682020307602, "loss": 0.1418, "step": 79 },
    { "epoch": 0.14, "grad_norm": 31.961870193481445, "learning_rate": 0.0009957477431991053, "loss": 0.1899, "step": 80 },
    { "epoch": 0.14, "grad_norm": 38.58375930786133, "learning_rate": 0.000995625561068772, "loss": 0.5641, "step": 81 },
    { "epoch": 0.15, "grad_norm": 0.32622194290161133, "learning_rate": 0.0009955016560644846, "loss": 0.1144, "step": 82 },
    { "epoch": 0.15, "grad_norm": 6.264970779418945, "learning_rate": 0.0009953760286169572, "loss": 0.4788, "step": 83 },
    { "epoch": 0.15, "grad_norm": 0.07168668508529663, "learning_rate": 0.0009952486791628904, "loss": 0.1326, "step": 84 },
    { "epoch": 0.15, "grad_norm": 35.18340301513672, "learning_rate": 0.000995119608144972, "loss": 0.3884, "step": 85 },
    { "epoch": 0.15, "grad_norm": 0.03896519914269447, "learning_rate": 0.000994988816011873, "loss": 0.1249, "step": 86 },
    { "epoch": 0.15, "grad_norm": 14.499520301818848, "learning_rate": 0.000994856303218248, "loss": 0.3756, "step": 87 },
    { "epoch": 0.16, "grad_norm": 0.3134947419166565, "learning_rate": 0.000994722070224733, "loss": 0.1539, "step": 88 },
    { "epoch": 0.16, "grad_norm": 117.39696502685547, "learning_rate": 0.000994586117497943, "loss": 0.5885, "step": 89 },
    { "epoch": 0.16, "grad_norm": 37.93465805053711, "learning_rate": 0.0009944484455104716, "loss": 0.7709, "step": 90 },
    { "epoch": 0.16, "grad_norm": 236.63330078125, "learning_rate": 0.0009943090547408888, "loss": 6.0182, "step": 91 },
    { "epoch": 0.16, "grad_norm": 1.1088515520095825, "learning_rate": 0.0009941679456737394, "loss": 0.1931, "step": 92 },
    { "epoch": 0.16, "grad_norm": 0.11310256272554398, "learning_rate": 0.0009940251187995411, "loss": 0.1293, "step": 93 },
    { "epoch": 0.17, "grad_norm": 0.6143047213554382, "learning_rate": 0.0009938805746147828, "loss": 0.2364, "step": 94 },
    { "epoch": 0.17, "grad_norm": 0.2461577206850052, "learning_rate": 0.0009937343136219232, "loss": 0.1504, "step": 95 },
    { "epoch": 0.17, "grad_norm": 97.17162322998047, "learning_rate": 0.0009935863363293895, "loss": 5.764, "step": 96 },
    { "epoch": 0.17, "grad_norm": 0.5417380928993225, "learning_rate": 0.000993436643251574, "loss": 0.1576, "step": 97 },
    { "epoch": 0.17, "grad_norm": 0.2737255096435547, "learning_rate": 0.0009932852349088341, "loss": 0.1437, "step": 98 },
    { "epoch": 0.18, "grad_norm": 138.00778198242188, "learning_rate": 0.0009931321118274896, "loss": 4.1331, "step": 99 },
    { "epoch": 0.18, "grad_norm": 46.4688606262207, "learning_rate": 0.0009929772745398205, "loss": 0.6178, "step": 100 },
    { "epoch": 0.18, "grad_norm": 0.49907386302948, "learning_rate": 0.0009928207235840663, "loss": 0.1445, "step": 101 },
    { "epoch": 0.18, "grad_norm": 0.33814460039138794, "learning_rate": 0.0009926624595044233, "loss": 0.139, "step": 102 },
    { "epoch": 0.18, "grad_norm": 0.3241071403026581, "learning_rate": 0.0009925024828510427, "loss": 0.1404, "step": 103 },
    { "epoch": 0.18, "grad_norm": 78.4036865234375, "learning_rate": 0.000992340794180029, "loss": 1.2663, "step": 104 },
    { "epoch": 0.19, "grad_norm": 1.113776445388794, "learning_rate": 0.000992177394053438, "loss": 0.162, "step": 105 },
    { "epoch": 0.19, "grad_norm": 16.512048721313477, "learning_rate": 0.0009920122830392748, "loss": 3.3373, "step": 106 },
    { "epoch": 0.19, "grad_norm": 111.53176879882812, "learning_rate": 0.0009918454617114918, "loss": 2.3969, "step": 107 },
    { "epoch": 0.19, "grad_norm": 14.91741943359375, "learning_rate": 0.0009916769306499865, "loss": 1.8837, "step": 108 },
    { "epoch": 0.19, "grad_norm": 61.30055618286133, "learning_rate": 0.0009915066904406, "loss": 10.4922, "step": 109 },
    { "epoch": 0.19, "grad_norm": 0.6948704123497009, "learning_rate": 0.0009913347416751147, "loss": 0.1536, "step": 110 },
    { "epoch": 0.2, "grad_norm": 0.7721084356307983, "learning_rate": 0.000991161084951252, "loss": 0.1356, "step": 111 },
    { "epoch": 0.2, "grad_norm": 0.24614596366882324, "learning_rate": 0.0009909857208726704, "loss": 0.1339, "step": 112 },
    { "epoch": 0.2, "grad_norm": 7.189969062805176, "learning_rate": 0.0009908086500489638, "loss": 0.2551, "step": 113 },
    { "epoch": 0.2, "grad_norm": 0.8675662279129028, "learning_rate": 0.0009906298730956585, "loss": 0.1668, "step": 114 },
    { "epoch": 0.2, "grad_norm": 0.605249285697937, "learning_rate": 0.0009904493906342123, "loss": 0.1478, "step": 115 },
    { "epoch": 0.21, "grad_norm": 0.8765722513198853, "learning_rate": 0.0009902672032920106, "loss": 0.1598, "step": 116 },
    { "epoch": 0.21, "grad_norm": 0.6021157503128052, "learning_rate": 0.0009900833117023665, "loss": 0.1506, "step": 117 },
    { "epoch": 0.21, "grad_norm": 0.28180792927742004, "learning_rate": 0.000989897716504516, "loss": 0.1389, "step": 118 },
    { "epoch": 0.21, "grad_norm": 0.21730898320674896, "learning_rate": 0.0009897104183436184, "loss": 0.1377, "step": 119 },
    { "epoch": 0.21, "grad_norm": 0.977118730545044, "learning_rate": 0.0009895214178707516, "loss": 0.1698, "step": 120 },
    { "epoch": 0.21, "grad_norm": 2.674729585647583, "learning_rate": 0.0009893307157429118, "loss": 0.1559, "step": 121 },
    { "epoch": 0.22, "grad_norm": 0.9852035045623779, "learning_rate": 0.0009891383126230102, "loss": 0.2027, "step": 122 },
    { "epoch": 0.22, "grad_norm": 0.36689773201942444, "learning_rate": 0.0009889442091798712, "loss": 0.1498, "step": 123 },
    { "epoch": 0.22, "grad_norm": 0.104621522128582, "learning_rate": 0.000988748406088229, "loss": 0.1379, "step": 124 },
    { "epoch": 0.22, "grad_norm": 74.17496490478516, "learning_rate": 0.0009885509040287268, "loss": 0.7724, "step": 125 },
    { "epoch": 0.22, "grad_norm": 1.2943025827407837, "learning_rate": 0.0009883517036879132, "loss": 0.2643, "step": 126 },
    { "epoch": 0.22, "grad_norm": 0.828774094581604, "learning_rate": 0.000988150805758241, "loss": 0.1852, "step": 127 },
    { "epoch": 0.23, "grad_norm": 0.13165877759456635, "learning_rate": 0.0009879482109380632, "loss": 0.1429, "step": 128 },
    { "epoch": 0.23, "grad_norm": 0.662426769733429, "learning_rate": 0.0009877439199316323, "loss": 0.1643, "step": 129 },
    { "epoch": 0.23, "grad_norm": 0.6256189942359924, "learning_rate": 0.0009875379334490962, "loss": 0.157, "step": 130 },
    { "epoch": 0.23, "grad_norm": 0.5049256086349487, "learning_rate": 0.0009873302522064972, "loss": 0.1484, "step": 131 },
    { "epoch": 0.23, "grad_norm": 1.4133671522140503, "learning_rate": 0.0009871208769257685, "loss": 0.1736, "step": 132 },
    { "epoch": 0.24, "grad_norm": 0.7930824756622314, "learning_rate": 0.0009869098083347323, "loss": 0.1543, "step": 133 },
    { "epoch": 0.24, "grad_norm": 0.5717449188232422, "learning_rate": 0.0009866970471670965, "loss": 0.1338, "step": 134 },
    { "epoch": 0.24, "grad_norm": 0.582081139087677, "learning_rate": 0.0009864825941624537, "loss": 0.1692, "step": 135 },
    { "epoch": 0.24, "grad_norm": 10.226588249206543, "learning_rate": 0.0009862664500662763, "loss": 0.2425, "step": 136 },
    { "epoch": 0.24, "grad_norm": 1.1186953783035278, "learning_rate": 0.0009860486156299164, "loss": 0.2052, "step": 137 },
    { "epoch": 0.24, "grad_norm": 0.2953661382198334, "learning_rate": 0.000985829091610601, "loss": 0.1408, "step": 138 },
    { "epoch": 0.25, "grad_norm": 0.8647088408470154, "learning_rate": 0.000985607878771431, "loss": 0.1571, "step": 139 },
    { "epoch": 0.25, "grad_norm": 0.41964420676231384, "learning_rate": 0.0009853849778813776, "loss": 0.1477, "step": 140 },
    { "epoch": 0.25, "grad_norm": 0.25675931572914124, "learning_rate": 0.0009851603897152803, "loss": 0.1398, "step": 141 },
    { "epoch": 0.25, "grad_norm": 0.2311631143093109, "learning_rate": 0.0009849341150538434, "loss": 0.1432, "step": 142 },
    { "epoch": 0.25, "eval_loss": 1.5366541147232056, "eval_runtime": 14.6962, "eval_samples_per_second": 32.457, "eval_steps_per_second": 8.165, "step": 142 },
    { "epoch": 0.25, "grad_norm": 41.83562469482422, "learning_rate": 0.0009847061546836339, "loss": 1.1525, "step": 143 },
    { "epoch": 0.25, "grad_norm": 0.27440375089645386, "learning_rate": 0.0009844765093970787, "loss": 0.1452, "step": 144 },
    { "epoch": 0.26, "grad_norm": 0.27643319964408875, "learning_rate": 0.0009842451799924616, "loss": 0.1069, "step": 145 },
    { "epoch": 0.26, "grad_norm": 0.21519601345062256, "learning_rate": 0.0009840121672739207, "loss": 0.1358, "step": 146 },
    { "epoch": 0.26, "grad_norm": 0.4073689877986908, "learning_rate": 0.0009837774720514456, "loss": 0.1545, "step": 147 },
    { "epoch": 0.26, "grad_norm": 0.13685636222362518, "learning_rate": 0.0009835410951408747, "loss": 0.1259, "step": 148 },
    { "epoch": 0.26, "grad_norm": 0.07474564015865326, "learning_rate": 0.000983303037363892, "loss": 0.1356, "step": 149 },
    { "epoch": 0.27, "grad_norm": 0.45116662979125977, "learning_rate": 0.0009830632995480241, "loss": 0.1379, "step": 150 },
    { "epoch": 0.27, "grad_norm": 0.1297813504934311, "learning_rate": 0.0009828218825266388, "loss": 0.1343, "step": 151 },
    { "epoch": 0.27, "grad_norm": 0.5846492052078247, "learning_rate": 0.00098257878713894, "loss": 0.1563, "step": 152 },
    { "epoch": 0.27, "grad_norm": 0.38457778096199036, "learning_rate": 0.0009823340142299662, "loss": 0.1477, "step": 153 },
    { "epoch": 0.27, "grad_norm": 0.09184035658836365, "learning_rate": 0.0009820875646505873, "loss": 0.1376, "step": 154 },
    { "epoch": 0.27, "grad_norm": 0.5166211128234863, "learning_rate": 0.0009818394392575019, "loss": 0.1498, "step": 155 },
    { "epoch": 0.28, "grad_norm": 0.2788640260696411, "learning_rate": 0.0009815896389132332, "loss": 0.1434, "step": 156 },
    { "epoch": 0.28, "grad_norm": 0.3762676417827606, "learning_rate": 0.0009813381644861276, "loss": 0.1482, "step": 157 },
    { "epoch": 0.28, "grad_norm": 0.3615610897541046, "learning_rate": 0.0009810850168503506, "loss": 0.1312, "step": 158 },
    { "epoch": 0.28, "grad_norm": 0.03483320027589798, "learning_rate": 0.0009808301968858837, "loss": 0.1239, "step": 159 },
    { "epoch": 0.28, "grad_norm": 0.5616227984428406, "learning_rate": 0.0009805737054785222, "loss": 0.1881, "step": 160 },
    { "epoch": 0.28, "grad_norm": 0.029542161151766777, "learning_rate": 0.000980315543519871, "loss": 0.1254, "step": 161 },
    { "epoch": 0.29, "grad_norm": 0.142581045627594, "learning_rate": 0.0009800557119073433, "loss": 0.1258, "step": 162 },
    { "epoch": 0.29, "grad_norm": 0.7289375066757202, "learning_rate": 0.0009797942115441546, "loss": 0.1526, "step": 163 },
    { "epoch": 0.29, "grad_norm": 0.6975064873695374, "learning_rate": 0.0009795310433393224, "loss": 0.1487, "step": 164 },
    { "epoch": 0.29, "grad_norm": 1.3072260618209839, "learning_rate": 0.0009792662082076617, "loss": 0.1712, "step": 165 },
    { "epoch": 0.29, "grad_norm": 0.2993917465209961, "learning_rate": 0.000978999707069782, "loss": 0.1424, "step": 166 },
    { "epoch": 0.3, "grad_norm": 0.3258236050605774, "learning_rate": 0.0009787315408520839, "loss": 0.135, "step": 167 },
    { "epoch": 0.3, "grad_norm": 0.26566603779792786, "learning_rate": 0.000978461710486756, "loss": 0.1441, "step": 168 },
    { "epoch": 0.3, "grad_norm": 1.1709599494934082, "learning_rate": 0.0009781902169117718, "loss": 0.2084, "step": 169 },
    { "epoch": 0.3, "grad_norm": 0.6554279923439026, "learning_rate": 0.000977917061070887, "loss": 0.1634, "step": 170 },
    { "epoch": 0.3, "grad_norm": 0.1635073721408844, "learning_rate": 0.000977642243913635, "loss": 0.1371, "step": 171 },
    { "epoch": 0.3, "grad_norm": 0.4419834017753601, "learning_rate": 0.0009773657663953242, "loss": 0.1523, "step": 172 },
    { "epoch": 0.31, "grad_norm": 0.839259147644043, "learning_rate": 0.000977087629477035, "loss": 0.1628, "step": 173 },
    { "epoch": 0.31, "grad_norm": 0.1979222148656845, "learning_rate": 0.0009768078341256155, "loss": 0.1367, "step": 174 },
    { "epoch": 0.31, "grad_norm": 0.2939910888671875, "learning_rate": 0.0009765263813136795, "loss": 0.1349, "step": 175 },
    { "epoch": 0.31, "grad_norm": 0.19882674515247345, "learning_rate": 0.0009762432720196024, "loss": 0.1424, "step": 176 },
    { "epoch": 0.31, "grad_norm": 0.07146954536437988, "learning_rate": 0.000975958507227517, "loss": 0.1237, "step": 177 },
    { "epoch": 0.31, "grad_norm": 0.5031868815422058, "learning_rate": 0.0009756720879273117, "loss": 0.1592, "step": 178 },
    { "epoch": 0.32, "grad_norm": 0.14860151708126068, "learning_rate": 0.0009753840151146258, "loss": 0.1396, "step": 179 },
    { "epoch": 0.32, "grad_norm": 0.10280521959066391, "learning_rate": 0.0009750942897908468, "loss": 0.1333, "step": 180 },
    { "epoch": 0.32, "grad_norm": 0.4652903974056244, "learning_rate": 0.0009748029129631061, "loss": 0.1421, "step": 181 },
    { "epoch": 0.32, "grad_norm": 0.3985591530799866, "learning_rate": 0.0009745098856442768, "loss": 0.1459, "step": 182 },
    { "epoch": 0.32, "grad_norm": 0.20321591198444366, "learning_rate": 0.0009742152088529683, "loss": 0.1381, "step": 183 },
    { "epoch": 0.33, "grad_norm": 0.7694361805915833, "learning_rate": 0.0009739188836135246, "loss": 0.1676, "step": 184 },
    { "epoch": 0.33, "grad_norm": 0.04469340294599533, "learning_rate": 0.0009736209109560201, "loss": 0.136, "step": 185 },
    { "epoch": 0.33, "grad_norm": 0.08576061576604843, "learning_rate": 0.0009733212919162549, "loss": 0.1408, "step": 186 },
    { "epoch": 0.33, "grad_norm": 0.042906519025564194, "learning_rate": 0.0009730200275357535, "loss": 0.1364, "step": 187 },
    { "epoch": 0.33, "grad_norm": 0.30054494738578796, "learning_rate": 0.0009727171188617588, "loss": 0.1539, "step": 188 },
    { "epoch": 0.33, "grad_norm": 0.05149005725979805,
+
"learning_rate": 0.0009724125669472299,
|
| 1347 |
+
"loss": 0.1352,
|
| 1348 |
+
"step": 189
|
| 1349 |
+
},
|
| 1350 |
+
{
|
| 1351 |
+
"epoch": 0.34,
|
| 1352 |
+
"grad_norm": 0.1381620466709137,
|
| 1353 |
+
"learning_rate": 0.0009721063728508383,
|
| 1354 |
+
"loss": 0.1409,
|
| 1355 |
+
"step": 190
|
| 1356 |
+
},
|
| 1357 |
+
{
|
| 1358 |
+
"epoch": 0.34,
|
| 1359 |
+
"grad_norm": 0.37344205379486084,
|
| 1360 |
+
"learning_rate": 0.0009717985376369639,
|
| 1361 |
+
"loss": 0.1299,
|
| 1362 |
+
"step": 191
|
| 1363 |
+
},
|
| 1364 |
+
{
|
| 1365 |
+
"epoch": 0.34,
|
| 1366 |
+
"grad_norm": 0.1037706583738327,
|
| 1367 |
+
"learning_rate": 0.0009714890623756912,
|
| 1368 |
+
"loss": 0.1341,
|
| 1369 |
+
"step": 192
|
| 1370 |
+
},
|
| 1371 |
+
{
|
| 1372 |
+
"epoch": 0.34,
|
| 1373 |
+
"grad_norm": 0.14189712703227997,
|
| 1374 |
+
"learning_rate": 0.0009711779481428056,
|
| 1375 |
+
"loss": 0.1418,
|
| 1376 |
+
"step": 193
|
| 1377 |
+
},
|
| 1378 |
+
{
|
| 1379 |
+
"epoch": 0.34,
|
| 1380 |
+
"grad_norm": 0.15108801424503326,
|
| 1381 |
+
"learning_rate": 0.0009708651960197903,
|
| 1382 |
+
"loss": 0.142,
|
| 1383 |
+
"step": 194
|
| 1384 |
+
},
|
| 1385 |
+
{
|
| 1386 |
+
"epoch": 0.34,
|
| 1387 |
+
"grad_norm": 0.037045519798994064,
|
| 1388 |
+
"learning_rate": 0.0009705508070938218,
|
| 1389 |
+
"loss": 0.1315,
|
| 1390 |
+
"step": 195
|
| 1391 |
+
},
|
| 1392 |
+
{
|
| 1393 |
+
"epoch": 0.35,
|
| 1394 |
+
"grad_norm": 0.23301652073860168,
|
| 1395 |
+
"learning_rate": 0.0009702347824577666,
|
| 1396 |
+
"loss": 0.1396,
|
| 1397 |
+
"step": 196
|
| 1398 |
+
},
|
| 1399 |
+
{
|
| 1400 |
+
"epoch": 0.35,
|
| 1401 |
+
"grad_norm": 0.08476269990205765,
|
| 1402 |
+
"learning_rate": 0.0009699171232101768,
|
| 1403 |
+
"loss": 0.1392,
|
| 1404 |
+
"step": 197
|
| 1405 |
+
},
|
| 1406 |
+
{
|
| 1407 |
+
"epoch": 0.35,
|
| 1408 |
+
"grad_norm": 0.4222690463066101,
|
| 1409 |
+
"learning_rate": 0.000969597830455287,
|
| 1410 |
+
"loss": 0.1463,
|
| 1411 |
+
"step": 198
|
| 1412 |
+
},
|
| 1413 |
+
{
|
| 1414 |
+
"epoch": 0.35,
|
| 1415 |
+
"grad_norm": 0.3234136402606964,
|
| 1416 |
+
"learning_rate": 0.0009692769053030099,
|
| 1417 |
+
"loss": 0.1257,
|
| 1418 |
+
"step": 199
|
| 1419 |
+
},
|
| 1420 |
+
{
|
| 1421 |
+
"epoch": 0.35,
|
| 1422 |
+
"grad_norm": 0.04025443643331528,
|
| 1423 |
+
"learning_rate": 0.0009689543488689332,
|
| 1424 |
+
"loss": 0.1303,
|
| 1425 |
+
"step": 200
|
| 1426 |
+
},
|
| 1427 |
+
{
|
| 1428 |
+
"epoch": 0.36,
|
| 1429 |
+
"grad_norm": 0.07074520736932755,
|
| 1430 |
+
"learning_rate": 0.0009686301622743144,
|
| 1431 |
+
"loss": 0.1289,
|
| 1432 |
+
"step": 201
|
| 1433 |
+
},
|
| 1434 |
+
{
|
| 1435 |
+
"epoch": 0.36,
|
| 1436 |
+
"grad_norm": 0.0788850486278534,
|
| 1437 |
+
"learning_rate": 0.0009683043466460782,
|
| 1438 |
+
"loss": 0.1236,
|
| 1439 |
+
"step": 202
|
| 1440 |
+
},
|
| 1441 |
+
{
|
| 1442 |
+
"epoch": 0.36,
|
| 1443 |
+
"grad_norm": 0.525541365146637,
|
| 1444 |
+
"learning_rate": 0.000967976903116812,
|
| 1445 |
+
"loss": 0.1564,
|
| 1446 |
+
"step": 203
|
| 1447 |
+
},
|
| 1448 |
+
{
|
| 1449 |
+
"epoch": 0.36,
|
| 1450 |
+
"grad_norm": 0.6145509481430054,
|
| 1451 |
+
"learning_rate": 0.0009676478328247623,
|
| 1452 |
+
"loss": 0.156,
|
| 1453 |
+
"step": 204
|
| 1454 |
+
},
|
| 1455 |
+
{
|
| 1456 |
+
"epoch": 0.36,
|
| 1457 |
+
"grad_norm": 0.230132058262825,
|
| 1458 |
+
"learning_rate": 0.0009673171369138296,
|
| 1459 |
+
"loss": 0.1425,
|
| 1460 |
+
"step": 205
|
| 1461 |
+
},
|
| 1462 |
+
{
|
| 1463 |
+
"epoch": 0.36,
|
| 1464 |
+
"grad_norm": 0.03262978792190552,
|
| 1465 |
+
"learning_rate": 0.0009669848165335666,
|
| 1466 |
+
"loss": 0.1297,
|
| 1467 |
+
"step": 206
|
| 1468 |
+
},
|
| 1469 |
+
{
|
| 1470 |
+
"epoch": 0.37,
|
| 1471 |
+
"grad_norm": 0.0462469644844532,
|
| 1472 |
+
"learning_rate": 0.0009666508728391718,
|
| 1473 |
+
"loss": 0.1177,
|
| 1474 |
+
"step": 207
|
| 1475 |
+
},
|
| 1476 |
+
{
|
| 1477 |
+
"epoch": 0.37,
|
| 1478 |
+
"grad_norm": 0.06880385428667068,
|
| 1479 |
+
"learning_rate": 0.0009663153069914874,
|
| 1480 |
+
"loss": 0.1207,
|
| 1481 |
+
"step": 208
|
| 1482 |
+
},
|
| 1483 |
+
{
|
| 1484 |
+
"epoch": 0.37,
|
| 1485 |
+
"grad_norm": 0.4248260259628296,
|
| 1486 |
+
"learning_rate": 0.000965978120156994,
|
| 1487 |
+
"loss": 0.1571,
|
| 1488 |
+
"step": 209
|
| 1489 |
+
},
|
| 1490 |
+
{
|
| 1491 |
+
"epoch": 0.37,
|
| 1492 |
+
"grad_norm": 0.060492075979709625,
|
| 1493 |
+
"learning_rate": 0.0009656393135078068,
|
| 1494 |
+
"loss": 0.1219,
|
| 1495 |
+
"step": 210
|
| 1496 |
+
},
|
| 1497 |
+
{
|
| 1498 |
+
"epoch": 0.37,
|
| 1499 |
+
"grad_norm": 0.12135621905326843,
|
| 1500 |
+
"learning_rate": 0.0009652988882216725,
|
| 1501 |
+
"loss": 0.1323,
|
| 1502 |
+
"step": 211
|
| 1503 |
+
},
|
| 1504 |
+
{
|
| 1505 |
+
"epoch": 0.38,
|
| 1506 |
+
"grad_norm": 0.252119243144989,
|
| 1507 |
+
"learning_rate": 0.0009649568454819637,
|
| 1508 |
+
"loss": 0.1366,
|
| 1509 |
+
"step": 212
|
| 1510 |
+
},
|
| 1511 |
+
{
|
| 1512 |
+
"epoch": 0.38,
|
| 1513 |
+
"grad_norm": 0.5283567905426025,
|
| 1514 |
+
"learning_rate": 0.0009646131864776761,
|
| 1515 |
+
"loss": 0.1246,
|
| 1516 |
+
"step": 213
|
| 1517 |
+
},
|
| 1518 |
+
{
|
| 1519 |
+
"epoch": 0.38,
|
| 1520 |
+
"grad_norm": 2.224665880203247,
|
| 1521 |
+
"learning_rate": 0.0009642679124034233,
|
| 1522 |
+
"loss": 0.2582,
|
| 1523 |
+
"step": 214
|
| 1524 |
+
},
|
| 1525 |
+
{
|
| 1526 |
+
"epoch": 0.38,
|
| 1527 |
+
"grad_norm": 1.9277523756027222,
|
| 1528 |
+
"learning_rate": 0.0009639210244594335,
|
| 1529 |
+
"loss": 0.2131,
|
| 1530 |
+
"step": 215
|
| 1531 |
+
},
|
| 1532 |
+
{
|
| 1533 |
+
"epoch": 0.38,
|
| 1534 |
+
"grad_norm": 0.5668452978134155,
|
| 1535 |
+
"learning_rate": 0.0009635725238515446,
|
| 1536 |
+
"loss": 0.141,
|
| 1537 |
+
"step": 216
|
| 1538 |
+
},
|
| 1539 |
+
{
|
| 1540 |
+
"epoch": 0.38,
|
| 1541 |
+
"grad_norm": 0.13912492990493774,
|
| 1542 |
+
"learning_rate": 0.000963222411791201,
|
| 1543 |
+
"loss": 0.1418,
|
| 1544 |
+
"step": 217
|
| 1545 |
+
},
|
| 1546 |
+
{
|
| 1547 |
+
"epoch": 0.39,
|
| 1548 |
+
"grad_norm": 0.39307814836502075,
|
| 1549 |
+
"learning_rate": 0.0009628706894954479,
|
| 1550 |
+
"loss": 0.1477,
|
| 1551 |
+
"step": 218
|
| 1552 |
+
},
|
| 1553 |
+
{
|
| 1554 |
+
"epoch": 0.39,
|
| 1555 |
+
"grad_norm": 0.26248928904533386,
|
| 1556 |
+
"learning_rate": 0.000962517358186929,
|
| 1557 |
+
"loss": 0.1315,
|
| 1558 |
+
"step": 219
|
| 1559 |
+
},
|
| 1560 |
+
{
|
| 1561 |
+
"epoch": 0.39,
|
| 1562 |
+
"grad_norm": 0.2875257730484009,
|
| 1563 |
+
"learning_rate": 0.0009621624190938803,
|
| 1564 |
+
"loss": 0.1321,
|
| 1565 |
+
"step": 220
|
| 1566 |
+
},
|
| 1567 |
+
{
|
| 1568 |
+
"epoch": 0.39,
|
| 1569 |
+
"grad_norm": 0.6386964917182922,
|
| 1570 |
+
"learning_rate": 0.0009618058734501269,
|
| 1571 |
+
"loss": 0.1668,
|
| 1572 |
+
"step": 221
|
| 1573 |
+
},
|
| 1574 |
+
{
|
| 1575 |
+
"epoch": 0.39,
|
| 1576 |
+
"grad_norm": 0.16165001690387726,
|
| 1577 |
+
"learning_rate": 0.0009614477224950789,
|
| 1578 |
+
"loss": 0.1272,
|
| 1579 |
+
"step": 222
|
| 1580 |
+
},
|
| 1581 |
+
{
|
| 1582 |
+
"epoch": 0.39,
|
| 1583 |
+
"grad_norm": 0.6959558129310608,
|
| 1584 |
+
"learning_rate": 0.0009610879674737262,
|
| 1585 |
+
"loss": 0.1381,
|
| 1586 |
+
"step": 223
|
| 1587 |
+
},
|
| 1588 |
+
{
|
| 1589 |
+
"epoch": 0.4,
|
| 1590 |
+
"grad_norm": 0.1701437532901764,
|
| 1591 |
+
"learning_rate": 0.0009607266096366352,
|
| 1592 |
+
"loss": 0.1366,
|
| 1593 |
+
"step": 224
|
| 1594 |
+
},
|
| 1595 |
+
{
|
| 1596 |
+
"epoch": 0.4,
|
| 1597 |
+
"grad_norm": 0.2511409819126129,
|
| 1598 |
+
"learning_rate": 0.0009603636502399437,
|
| 1599 |
+
"loss": 0.126,
|
| 1600 |
+
"step": 225
|
| 1601 |
+
},
|
| 1602 |
+
{
|
| 1603 |
+
"epoch": 0.4,
|
| 1604 |
+
"grad_norm": 0.04554220288991928,
|
| 1605 |
+
"learning_rate": 0.0009599990905453566,
|
| 1606 |
+
"loss": 0.1321,
|
| 1607 |
+
"step": 226
|
| 1608 |
+
},
|
| 1609 |
+
{
|
| 1610 |
+
"epoch": 0.4,
|
| 1611 |
+
"grad_norm": 0.3964705765247345,
|
| 1612 |
+
"learning_rate": 0.000959632931820142,
|
| 1613 |
+
"loss": 0.1383,
|
| 1614 |
+
"step": 227
|
| 1615 |
+
},
|
| 1616 |
+
{
|
| 1617 |
+
"epoch": 0.4,
|
| 1618 |
+
"grad_norm": 0.10925984382629395,
|
| 1619 |
+
"learning_rate": 0.0009592651753371264,
|
| 1620 |
+
"loss": 0.1226,
|
| 1621 |
+
"step": 228
|
| 1622 |
+
},
|
| 1623 |
+
{
|
| 1624 |
+
"epoch": 0.41,
|
| 1625 |
+
"grad_norm": 0.19012318551540375,
|
| 1626 |
+
"learning_rate": 0.0009588958223746903,
|
| 1627 |
+
"loss": 0.1255,
|
| 1628 |
+
"step": 229
|
| 1629 |
+
},
|
| 1630 |
+
{
|
| 1631 |
+
"epoch": 0.41,
|
| 1632 |
+
"grad_norm": 0.23432157933712006,
|
| 1633 |
+
"learning_rate": 0.0009585248742167639,
|
| 1634 |
+
"loss": 0.1152,
|
| 1635 |
+
"step": 230
|
| 1636 |
+
},
|
| 1637 |
+
{
|
| 1638 |
+
"epoch": 0.41,
|
| 1639 |
+
"grad_norm": 0.1737753301858902,
|
| 1640 |
+
"learning_rate": 0.0009581523321528223,
|
| 1641 |
+
"loss": 0.1468,
|
| 1642 |
+
"step": 231
|
| 1643 |
+
},
|
| 1644 |
+
{
|
| 1645 |
+
"epoch": 0.41,
|
| 1646 |
+
"grad_norm": 0.2625434100627899,
|
| 1647 |
+
"learning_rate": 0.0009577781974778817,
|
| 1648 |
+
"loss": 0.1296,
|
| 1649 |
+
"step": 232
|
| 1650 |
+
},
|
| 1651 |
+
{
|
| 1652 |
+
"epoch": 0.41,
|
| 1653 |
+
"grad_norm": 0.3056884706020355,
|
| 1654 |
+
"learning_rate": 0.000957402471492494,
|
| 1655 |
+
"loss": 0.1574,
|
| 1656 |
+
"step": 233
|
| 1657 |
+
},
|
| 1658 |
+
{
|
| 1659 |
+
"epoch": 0.41,
|
| 1660 |
+
"grad_norm": 0.4111999273300171,
|
| 1661 |
+
"learning_rate": 0.0009570251555027432,
|
| 1662 |
+
"loss": 0.1434,
|
| 1663 |
+
"step": 234
|
| 1664 |
+
},
|
| 1665 |
+
{
|
| 1666 |
+
"epoch": 0.42,
|
| 1667 |
+
"grad_norm": 0.056673482060432434,
|
| 1668 |
+
"learning_rate": 0.0009566462508202401,
|
| 1669 |
+
"loss": 0.1337,
|
| 1670 |
+
"step": 235
|
| 1671 |
+
},
|
| 1672 |
+
{
|
| 1673 |
+
"epoch": 0.42,
|
| 1674 |
+
"grad_norm": 0.3861597180366516,
|
| 1675 |
+
"learning_rate": 0.0009562657587621184,
|
| 1676 |
+
"loss": 0.1609,
|
| 1677 |
+
"step": 236
|
| 1678 |
+
},
|
| 1679 |
+
{
|
| 1680 |
+
"epoch": 0.42,
|
| 1681 |
+
"grad_norm": 0.35893362760543823,
|
| 1682 |
+
"learning_rate": 0.0009558836806510292,
|
| 1683 |
+
"loss": 0.1189,
|
| 1684 |
+
"step": 237
|
| 1685 |
+
},
|
| 1686 |
+
{
|
| 1687 |
+
"epoch": 0.42,
|
| 1688 |
+
"grad_norm": 0.40538331866264343,
|
| 1689 |
+
"learning_rate": 0.0009555000178151374,
|
| 1690 |
+
"loss": 0.1504,
|
| 1691 |
+
"step": 238
|
| 1692 |
+
},
|
| 1693 |
+
{
|
| 1694 |
+
"epoch": 0.42,
|
| 1695 |
+
"grad_norm": 81.36141967773438,
|
| 1696 |
+
"learning_rate": 0.0009551147715881167,
|
| 1697 |
+
"loss": 4.7235,
|
| 1698 |
+
"step": 239
|
| 1699 |
+
},
|
| 1700 |
+
{
|
| 1701 |
+
"epoch": 0.42,
|
| 1702 |
+
"grad_norm": 0.21178042888641357,
|
| 1703 |
+
"learning_rate": 0.0009547279433091446,
|
| 1704 |
+
"loss": 0.1139,
|
| 1705 |
+
"step": 240
|
| 1706 |
+
},
|
| 1707 |
+
{
|
| 1708 |
+
"epoch": 0.43,
|
| 1709 |
+
"grad_norm": 0.27380529046058655,
|
| 1710 |
+
"learning_rate": 0.0009543395343228983,
|
| 1711 |
+
"loss": 0.1504,
|
| 1712 |
+
"step": 241
|
| 1713 |
+
},
|
| 1714 |
+
{
|
| 1715 |
+
"epoch": 0.43,
|
| 1716 |
+
"grad_norm": 41.42683410644531,
|
| 1717 |
+
"learning_rate": 0.0009539495459795498,
|
| 1718 |
+
"loss": 1.2477,
|
| 1719 |
+
"step": 242
|
| 1720 |
+
},
|
| 1721 |
+
{
|
| 1722 |
+
"epoch": 0.43,
|
| 1723 |
+
"grad_norm": 0.14853385090827942,
|
| 1724 |
+
"learning_rate": 0.0009535579796347612,
|
| 1725 |
+
"loss": 0.1343,
|
| 1726 |
+
"step": 243
|
| 1727 |
+
},
|
| 1728 |
+
{
|
| 1729 |
+
"epoch": 0.43,
|
| 1730 |
+
"grad_norm": 0.3484509289264679,
|
| 1731 |
+
"learning_rate": 0.0009531648366496798,
|
| 1732 |
+
"loss": 0.15,
|
| 1733 |
+
"step": 244
|
| 1734 |
+
},
|
| 1735 |
+
{
|
| 1736 |
+
"epoch": 0.43,
|
| 1737 |
+
"grad_norm": 0.20152732729911804,
|
| 1738 |
+
"learning_rate": 0.0009527701183909336,
|
| 1739 |
+
"loss": 0.1399,
|
| 1740 |
+
"step": 245
|
| 1741 |
+
},
|
| 1742 |
+
{
|
| 1743 |
+
"epoch": 0.44,
|
| 1744 |
+
"grad_norm": 80.84031677246094,
|
| 1745 |
+
"learning_rate": 0.000952373826230627,
|
| 1746 |
+
"loss": 3.1939,
|
| 1747 |
+
"step": 246
|
| 1748 |
+
},
|
| 1749 |
+
{
|
| 1750 |
+
"epoch": 0.44,
|
| 1751 |
+
"grad_norm": 15.475607872009277,
|
| 1752 |
+
"learning_rate": 0.0009519759615463346,
|
| 1753 |
+
"loss": 3.3935,
|
| 1754 |
+
"step": 247
|
| 1755 |
+
},
|
| 1756 |
+
{
|
| 1757 |
+
"epoch": 0.44,
|
| 1758 |
+
"grad_norm": 77.19477081298828,
|
| 1759 |
+
"learning_rate": 0.0009515765257210979,
|
| 1760 |
+
"loss": 6.5034,
|
| 1761 |
+
"step": 248
|
| 1762 |
+
},
|
| 1763 |
+
{
|
| 1764 |
+
"epoch": 0.44,
|
| 1765 |
+
"grad_norm": 0.1174071803689003,
|
| 1766 |
+
"learning_rate": 0.0009511755201434205,
|
| 1767 |
+
"loss": 0.1212,
|
| 1768 |
+
"step": 249
|
| 1769 |
+
},
|
| 1770 |
+
{
|
| 1771 |
+
"epoch": 0.44,
|
| 1772 |
+
"grad_norm": 16.503982543945312,
|
| 1773 |
+
"learning_rate": 0.0009507729462072614,
|
| 1774 |
+
"loss": 0.3753,
|
| 1775 |
+
"step": 250
|
| 1776 |
+
},
|
| 1777 |
+
{
|
| 1778 |
+
"epoch": 0.44,
|
| 1779 |
+
"grad_norm": 76.65412902832031,
|
| 1780 |
+
"learning_rate": 0.0009503688053120326,
|
| 1781 |
+
"loss": 0.9386,
|
| 1782 |
+
"step": 251
|
| 1783 |
+
},
|
| 1784 |
+
{
|
| 1785 |
+
"epoch": 0.45,
|
| 1786 |
+
"grad_norm": 94.82160186767578,
|
| 1787 |
+
"learning_rate": 0.0009499630988625925,
|
| 1788 |
+
"loss": 4.7449,
|
| 1789 |
+
"step": 252
|
| 1790 |
+
},
|
| 1791 |
+
{
|
| 1792 |
+
"epoch": 0.45,
|
| 1793 |
+
"grad_norm": 0.2721010148525238,
|
| 1794 |
+
"learning_rate": 0.0009495558282692421,
|
| 1795 |
+
"loss": 0.1358,
|
| 1796 |
+
"step": 253
|
| 1797 |
+
},
|
| 1798 |
+
{
|
| 1799 |
+
"epoch": 0.45,
|
| 1800 |
+
"grad_norm": 0.5150814056396484,
|
| 1801 |
+
"learning_rate": 0.0009491469949477187,
|
| 1802 |
+
"loss": 0.1622,
|
| 1803 |
+
"step": 254
|
| 1804 |
+
},
|
| 1805 |
+
{
|
| 1806 |
+
"epoch": 0.45,
|
| 1807 |
+
"grad_norm": 51.050167083740234,
|
| 1808 |
+
"learning_rate": 0.0009487366003191931,
|
| 1809 |
+
"loss": 0.7818,
|
| 1810 |
+
"step": 255
|
| 1811 |
+
},
|
| 1812 |
+
{
|
| 1813 |
+
"epoch": 0.45,
|
| 1814 |
+
"grad_norm": 11.698090553283691,
|
| 1815 |
+
"learning_rate": 0.0009483246458102625,
|
| 1816 |
+
"loss": 0.3862,
|
| 1817 |
+
"step": 256
|
| 1818 |
+
},
|
| 1819 |
+
{
|
| 1820 |
+
"epoch": 0.45,
|
| 1821 |
+
"grad_norm": 0.648543655872345,
|
| 1822 |
+
"learning_rate": 0.0009479111328529472,
|
| 1823 |
+
"loss": 0.1884,
|
| 1824 |
+
"step": 257
|
| 1825 |
+
},
|
| 1826 |
+
{
|
| 1827 |
+
"epoch": 0.46,
|
| 1828 |
+
"grad_norm": 0.745293140411377,
|
| 1829 |
+
"learning_rate": 0.0009474960628846843,
|
| 1830 |
+
"loss": 0.1562,
|
| 1831 |
+
"step": 258
|
| 1832 |
+
},
|
| 1833 |
+
{
|
| 1834 |
+
"epoch": 0.46,
|
| 1835 |
+
"grad_norm": 0.17890043556690216,
|
| 1836 |
+
"learning_rate": 0.0009470794373483235,
|
| 1837 |
+
"loss": 0.1425,
|
| 1838 |
+
"step": 259
|
| 1839 |
+
},
|
| 1840 |
+
{
|
| 1841 |
+
"epoch": 0.46,
|
| 1842 |
+
"grad_norm": 0.5058090686798096,
|
| 1843 |
+
"learning_rate": 0.0009466612576921223,
|
| 1844 |
+
"loss": 0.17,
|
| 1845 |
+
"step": 260
|
| 1846 |
+
},
|
| 1847 |
+
{
|
| 1848 |
+
"epoch": 0.46,
|
| 1849 |
+
"grad_norm": 1.3177820444107056,
|
| 1850 |
+
"learning_rate": 0.00094624152536974,
|
| 1851 |
+
"loss": 0.15,
|
| 1852 |
+
"step": 261
|
| 1853 |
+
},
|
| 1854 |
+
{
|
| 1855 |
+
"epoch": 0.46,
|
| 1856 |
+
"grad_norm": 0.49652573466300964,
|
| 1857 |
+
"learning_rate": 0.0009458202418402337,
|
| 1858 |
+
"loss": 0.145,
|
| 1859 |
+
"step": 262
|
| 1860 |
+
},
|
| 1861 |
+
{
|
| 1862 |
+
"epoch": 0.47,
|
| 1863 |
+
"grad_norm": 11.423394203186035,
|
| 1864 |
+
"learning_rate": 0.0009453974085680526,
|
| 1865 |
+
"loss": 0.349,
|
| 1866 |
+
"step": 263
|
| 1867 |
+
},
|
| 1868 |
+
{
|
| 1869 |
+
"epoch": 0.47,
|
| 1870 |
+
"grad_norm": 1.5422337055206299,
|
| 1871 |
+
"learning_rate": 0.0009449730270230326,
|
| 1872 |
+
"loss": 0.211,
|
| 1873 |
+
"step": 264
|
| 1874 |
+
},
|
| 1875 |
+
{
|
| 1876 |
+
"epoch": 0.47,
|
| 1877 |
+
"grad_norm": 103.68435668945312,
|
| 1878 |
+
"learning_rate": 0.0009445470986803921,
|
| 1879 |
+
"loss": 17.4069,
|
| 1880 |
+
"step": 265
|
| 1881 |
+
},
|
| 1882 |
+
{
|
| 1883 |
+
"epoch": 0.47,
|
| 1884 |
+
"grad_norm": 54.51758575439453,
|
| 1885 |
+
"learning_rate": 0.0009441196250207267,
|
| 1886 |
+
"loss": 15.685,
|
| 1887 |
+
"step": 266
|
| 1888 |
+
},
|
| 1889 |
+
{
|
| 1890 |
+
"epoch": 0.47,
|
| 1891 |
+
"grad_norm": 14.596623420715332,
|
| 1892 |
+
"learning_rate": 0.0009436906075300032,
|
| 1893 |
+
"loss": 0.791,
|
| 1894 |
+
"step": 267
|
| 1895 |
+
},
|
| 1896 |
+
{
|
| 1897 |
+
"epoch": 0.47,
|
| 1898 |
+
"grad_norm": 3.3164780139923096,
|
| 1899 |
+
"learning_rate": 0.000943260047699555,
|
| 1900 |
+
"loss": 0.3611,
|
| 1901 |
+
"step": 268
|
| 1902 |
+
},
|
| 1903 |
+
{
|
| 1904 |
+
"epoch": 0.48,
|
| 1905 |
+
"grad_norm": 0.3087855577468872,
|
| 1906 |
+
"learning_rate": 0.0009428279470260776,
|
| 1907 |
+
"loss": 0.1332,
|
| 1908 |
+
"step": 269
|
| 1909 |
+
},
|
| 1910 |
+
{
|
| 1911 |
+
"epoch": 0.48,
|
| 1912 |
+
"grad_norm": 1.1544523239135742,
|
| 1913 |
+
"learning_rate": 0.0009423943070116219,
|
| 1914 |
+
"loss": 0.2405,
|
| 1915 |
+
"step": 270
|
| 1916 |
+
},
|
| 1917 |
+
{
|
| 1918 |
+
"epoch": 0.48,
|
| 1919 |
+
"grad_norm": 0.27010253071784973,
|
| 1920 |
+
"learning_rate": 0.00094195912916359,
|
| 1921 |
+
"loss": 0.1241,
|
| 1922 |
+
"step": 271
|
| 1923 |
+
},
|
| 1924 |
+
{
|
| 1925 |
+
"epoch": 0.48,
|
| 1926 |
+
"grad_norm": 0.2287709265947342,
|
| 1927 |
+
"learning_rate": 0.0009415224149947306,
|
| 1928 |
+
"loss": 0.1366,
|
| 1929 |
+
"step": 272
|
| 1930 |
+
},
|
| 1931 |
+
{
|
| 1932 |
+
"epoch": 0.48,
|
| 1933 |
+
"grad_norm": 0.5216432809829712,
|
| 1934 |
+
"learning_rate": 0.0009410841660231316,
|
| 1935 |
+
"loss": 0.1641,
|
| 1936 |
+
"step": 273
|
| 1937 |
+
},
|
| 1938 |
+
{
|
| 1939 |
+
"epoch": 0.48,
|
| 1940 |
+
"grad_norm": 1.3091949224472046,
|
| 1941 |
+
"learning_rate": 0.0009406443837722167,
|
| 1942 |
+
"loss": 0.2524,
|
| 1943 |
+
"step": 274
|
| 1944 |
+
},
|
| 1945 |
+
{
|
| 1946 |
+
"epoch": 0.49,
|
| 1947 |
+
"grad_norm": 0.11813609302043915,
|
| 1948 |
+
"learning_rate": 0.0009402030697707398,
|
| 1949 |
+
"loss": 0.1353,
|
| 1950 |
+
"step": 275
|
| 1951 |
+
},
|
| 1952 |
+
{
|
| 1953 |
+
"epoch": 0.49,
|
| 1954 |
+
"grad_norm": 1.3709551095962524,
|
| 1955 |
+
"learning_rate": 0.000939760225552779,
|
| 1956 |
+
"loss": 0.2714,
|
| 1957 |
+
"step": 276
|
| 1958 |
+
},
|
| 1959 |
+
{
|
| 1960 |
+
"epoch": 0.49,
|
| 1961 |
+
"grad_norm": 8.527563095092773,
|
| 1962 |
+
"learning_rate": 0.0009393158526577322,
|
| 1963 |
+
"loss": 0.1955,
|
| 1964 |
+
"step": 277
|
| 1965 |
+
},
|
| 1966 |
+
{
|
| 1967 |
+
"epoch": 0.49,
|
| 1968 |
+
"grad_norm": 21.874027252197266,
|
| 1969 |
+
"learning_rate": 0.0009388699526303105,
|
| 1970 |
+
"loss": 0.2398,
|
| 1971 |
+
"step": 278
|
| 1972 |
+
},
|
| 1973 |
+
{
|
| 1974 |
+
"epoch": 0.49,
|
| 1975 |
+
"grad_norm": 51.793731689453125,
|
| 1976 |
+
"learning_rate": 0.0009384225270205339,
|
| 1977 |
+
"loss": 1.3069,
|
| 1978 |
+
"step": 279
|
| 1979 |
+
},
|
| 1980 |
+
{
|
| 1981 |
+
"epoch": 0.5,
|
| 1982 |
+
"grad_norm": 0.6711062788963318,
|
| 1983 |
+
"learning_rate": 0.0009379735773837259,
|
| 1984 |
+
"loss": 0.1664,
|
| 1985 |
+
"step": 280
|
| 1986 |
+
},
|
| 1987 |
+
{
|
| 1988 |
+
"epoch": 0.5,
|
| 1989 |
+
"grad_norm": 5.93789005279541,
|
| 1990 |
+
"learning_rate": 0.0009375231052805072,
|
| 1991 |
+
"loss": 0.2455,
|
| 1992 |
+
"step": 281
|
| 1993 |
+
},
|
| 1994 |
+
{
|
| 1995 |
+
"epoch": 0.5,
|
| 1996 |
+
"grad_norm": 62.527198791503906,
|
| 1997 |
+
"learning_rate": 0.0009370711122767912,
|
| 1998 |
+
"loss": 6.6447,
|
| 1999 |
+
"step": 282
|
| 2000 |
+
},
|
| 2001 |
+
{
|
| 2002 |
+
"epoch": 0.5,
|
| 2003 |
+
"grad_norm": 22.35348129272461,
|
| 2004 |
+
"learning_rate": 0.000936617599943778,
|
| 2005 |
+
"loss": 2.5015,
|
| 2006 |
+
"step": 283
|
| 2007 |
+
},
|
| 2008 |
+
{
|
| 2009 |
+
"epoch": 0.5,
|
| 2010 |
+
"grad_norm": 0.7277780175209045,
|
| 2011 |
+
"learning_rate": 0.0009361625698579493,
|
| 2012 |
+
"loss": 0.1667,
|
| 2013 |
+
"step": 284
|
| 2014 |
+
},
|
| 2015 |
+
{
|
| 2016 |
+
"epoch": 0.5,
|
| 2017 |
+
"eval_loss": 0.14179374277591705,
|
| 2018 |
+
"eval_runtime": 14.7139,
|
| 2019 |
+
"eval_samples_per_second": 32.418,
|
| 2020 |
+
"eval_steps_per_second": 8.156,
|
| 2021 |
+
"step": 284
|
| 2022 |
+
},
|
| 2023 |
+
{
|
| 2024 |
+
"epoch": 0.5,
|
| 2025 |
+
"grad_norm": 0.26271358132362366,
|
| 2026 |
+
"learning_rate": 0.0009357060236010625,
|
| 2027 |
+
"loss": 0.1429,
|
| 2028 |
+
"step": 285
|
| 2029 |
+
},
|
| 2030 |
+
{
|
| 2031 |
+
"epoch": 0.51,
|
| 2032 |
+
"grad_norm": 21.24464988708496,
|
| 2033 |
+
"learning_rate": 0.0009352479627601457,
|
| 2034 |
+
"loss": 2.0706,
|
| 2035 |
+
"step": 286
|
| 2036 |
+
},
|
| 2037 |
+
{
|
| 2038 |
+
"epoch": 0.51,
|
| 2039 |
+
"grad_norm": 6.5764265060424805,
|
| 2040 |
+
"learning_rate": 0.0009347883889274922,
|
| 2041 |
+
"loss": 0.3337,
|
| 2042 |
+
"step": 287
|
| 2043 |
+
},
|
| 2044 |
+
{
|
| 2045 |
+
"epoch": 0.51,
|
| 2046 |
+
"grad_norm": 0.6868380904197693,
|
| 2047 |
+
"learning_rate": 0.0009343273037006539,
|
| 2048 |
+
"loss": 0.1994,
|
| 2049 |
+
"step": 288
|
| 2050 |
+
},
|
| 2051 |
+
{
|
| 2052 |
+
"epoch": 0.51,
|
| 2053 |
+
"grad_norm": 0.9018234610557556,
|
| 2054 |
+
"learning_rate": 0.0009338647086824372,
|
| 2055 |
+
"loss": 0.1908,
|
| 2056 |
+
"step": 289
|
| 2057 |
+
},
|
| 2058 |
+
{
|
| 2059 |
+
"epoch": 0.51,
|
| 2060 |
+
"grad_norm": 1.7751502990722656,
|
| 2061 |
+
"learning_rate": 0.0009334006054808966,
|
| 2062 |
+
"loss": 0.2028,
|
| 2063 |
+
"step": 290
|
| 2064 |
+
},
|
| 2065 |
+
{
|
| 2066 |
+
"epoch": 0.51,
|
| 2067 |
+
"grad_norm": 0.5386408567428589,
|
| 2068 |
+
"learning_rate": 0.0009329349957093293,
|
| 2069 |
+
"loss": 0.1853,
|
| 2070 |
+
"step": 291
|
| 2071 |
+
},
|
| 2072 |
+
{
|
| 2073 |
+
"epoch": 0.52,
|
| 2074 |
+
"grad_norm": 1.4171103239059448,
|
| 2075 |
+
"learning_rate": 0.0009324678809862695,
|
| 2076 |
+
"loss": 0.3597,
|
| 2077 |
+
"step": 292
|
| 2078 |
+
},
|
| 2079 |
+
{
|
| 2080 |
+
"epoch": 0.52,
|
| 2081 |
+
"grad_norm": 0.4105970561504364,
|
| 2082 |
+
"learning_rate": 0.0009319992629354827,
|
| 2083 |
+
"loss": 0.1344,
|
| 2084 |
+
"step": 293
|
| 2085 |
+
},
|
| 2086 |
+
{
|
| 2087 |
+
"epoch": 0.52,
|
| 2088 |
+
"grad_norm": 0.26628127694129944,
|
| 2089 |
+
"learning_rate": 0.000931529143185961,
|
| 2090 |
+
"loss": 0.1453,
|
| 2091 |
+
"step": 294
|
| 2092 |
+
},
|
| 2093 |
+
{
|
| 2094 |
+
"epoch": 0.52,
|
| 2095 |
+
"grad_norm": 14.981964111328125,
|
| 2096 |
+
"learning_rate": 0.0009310575233719154,
|
| 2097 |
+
"loss": 0.2563,
|
| 2098 |
+
"step": 295
|
| 2099 |
+
},
|
| 2100 |
+
{
|
| 2101 |
+
"epoch": 0.52,
|
| 2102 |
+
"grad_norm": 0.6945788264274597,
|
| 2103 |
+
"learning_rate": 0.0009305844051327725,
|
| 2104 |
+
"loss": 0.1229,
|
| 2105 |
+
"step": 296
|
| 2106 |
+
},
|
| 2107 |
+
{
|
| 2108 |
+
"epoch": 0.53,
|
| 2109 |
+
"grad_norm": 31.034496307373047,
|
| 2110 |
+
"learning_rate": 0.000930109790113167,
|
| 2111 |
+
"loss": 1.2974,
|
| 2112 |
+
"step": 297
|
| 2113 |
+
},
|
| 2114 |
+
{
|
| 2115 |
+
"epoch": 0.53,
|
| 2116 |
+
"grad_norm": 1.5794603824615479,
|
| 2117 |
+
"learning_rate": 0.0009296336799629368,
|
| 2118 |
+
"loss": 0.22,
|
| 2119 |
+
"step": 298
|
| 2120 |
+
},
|
| 2121 |
+
{
|
| 2122 |
+
"epoch": 0.53,
|
| 2123 |
+
"grad_norm": 0.33219394087791443,
|
| 2124 |
+
"learning_rate": 0.0009291560763371172,
|
| 2125 |
+
"loss": 0.1262,
|
| 2126 |
+
"step": 299
|
| 2127 |
+
},
|
| 2128 |
+
{
|
| 2129 |
+
"epoch": 0.53,
|
| 2130 |
+
"grad_norm": 2.597118377685547,
|
| 2131 |
+
"learning_rate": 0.000928676980895935,
|
| 2132 |
+
"loss": 0.4026,
|
| 2133 |
+
"step": 300
|
| 2134 |
+
},
|
| 2135 |
+
{
|
| 2136 |
+
"epoch": 0.53,
|
| 2137 |
+
"grad_norm": 13.547090530395508,
|
| 2138 |
+
"learning_rate": 0.0009281963953048029,
|
| 2139 |
+
"loss": 1.3086,
|
| 2140 |
+
"step": 301
|
| 2141 |
+
},
|
| 2142 |
+
{
|
| 2143 |
+
"epoch": 0.53,
|
| 2144 |
+
"grad_norm": 1.289302945137024,
|
| 2145 |
+
"learning_rate": 0.0009277143212343134,
|
| 2146 |
+
"loss": 0.2215,
|
| 2147 |
+
"step": 302
|
| 2148 |
+
},
|
| 2149 |
+
{
|
| 2150 |
+
"epoch": 0.54,
|
| 2151 |
+
"grad_norm": 1.2176313400268555,
|
| 2152 |
+
"learning_rate": 0.0009272307603602334,
|
| 2153 |
+
"loss": 0.15,
|
| 2154 |
+
"step": 303
|
| 2155 |
+
},
|
| 2156 |
+
{
|
| 2157 |
+
"epoch": 0.54,
|
| 2158 |
+
"grad_norm": 4.436944007873535,
|
| 2159 |
+
"learning_rate": 0.0009267457143634979,
|
| 2160 |
+
"loss": 0.514,
|
| 2161 |
+
"step": 304
|
| 2162 |
+
},
|
| 2163 |
+
{
|
| 2164 |
+
"epoch": 0.54,
|
| 2165 |
+
"grad_norm": 29.960241317749023,
|
| 2166 |
+
"learning_rate": 0.0009262591849302047,
|
| 2167 |
+
"loss": 3.5389,
|
| 2168 |
+
"step": 305
|
| 2169 |
+
},
|
| 2170 |
+
{
|
| 2171 |
+
"epoch": 0.54,
|
| 2172 |
+
"grad_norm": 5.514049530029297,
|
| 2173 |
+
"learning_rate": 0.0009257711737516082,
|
| 2174 |
+
"loss": 0.2902,
|
| 2175 |
+
"step": 306
|
| 2176 |
+
},
|
| 2177 |
+
{
|
| 2178 |
+
"epoch": 0.54,
|
| 2179 |
+
"grad_norm": 2.331019401550293,
|
| 2180 |
+
"learning_rate": 0.0009252816825241135,
|
| 2181 |
+
"loss": 0.2775,
|
| 2182 |
+
"step": 307
|
| 2183 |
+
},
|
| 2184 |
+
{
|
| 2185 |
+
"epoch": 0.54,
|
| 2186 |
+
"grad_norm": 0.5708584189414978,
|
| 2187 |
+
"learning_rate": 0.0009247907129492707,
|
| 2188 |
+
"loss": 0.1438,
|
| 2189 |
+
"step": 308
|
| 2190 |
+
},
|
| 2191 |
+
{
|
| 2192 |
+
"epoch": 0.55,
|
| 2193 |
+
"grad_norm": 2.16607928276062,
|
| 2194 |
+
"learning_rate": 0.0009242982667337685,
|
| 2195 |
+
"loss": 0.2383,
|
| 2196 |
+
"step": 309
|
| 2197 |
+
},
|
| 2198 |
+
{
|
| 2199 |
+
"epoch": 0.55,
|
| 2200 |
+
"grad_norm": 1.5346423387527466,
|
| 2201 |
+
"learning_rate": 0.0009238043455894293,
|
| 2202 |
+
"loss": 0.1793,
|
| 2203 |
+
"step": 310
|
| 2204 |
+
},
|
| 2205 |
+
{
|
| 2206 |
+
"epoch": 0.55,
|
| 2207 |
+
"grad_norm": 0.6200052499771118,
|
| 2208 |
+
"learning_rate": 0.000923308951233202,
|
| 2209 |
+
"loss": 0.1473,
|
| 2210 |
+
"step": 311
|
| 2211 |
+
},
|
| 2212 |
+
{
|
| 2213 |
+
"epoch": 0.55,
|
| 2214 |
+
"grad_norm": 64.87612915039062,
|
| 2215 |
+
"learning_rate": 0.0009228120853871572,
|
| 2216 |
+
"loss": 0.8875,
|
| 2217 |
+
"step": 312
|
| 2218 |
+
},
|
| 2219 |
+
{
|
| 2220 |
+
"epoch": 0.55,
|
| 2221 |
+
"grad_norm": 1.077471137046814,
|
| 2222 |
+
"learning_rate": 0.0009223137497784797,
|
| 2223 |
+
"loss": 0.2114,
|
| 2224 |
+
"step": 313
|
| 2225 |
+
},
|
| 2226 |
+
{
|
| 2227 |
+
"epoch": 0.56,
|
| 2228 |
+
"grad_norm": 3.5934722423553467,
|
| 2229 |
+
"learning_rate": 0.0009218139461394644,
|
| 2230 |
+
"loss": 0.2852,
|
| 2231 |
+
"step": 314
|
| 2232 |
+
},
|
| 2233 |
+
{
|
| 2234 |
+
"epoch": 0.56,
|
| 2235 |
+
"grad_norm": 0.10276800394058228,
|
| 2236 |
+
"learning_rate": 0.0009213126762075088,
|
| 2237 |
+
"loss": 0.1365,
|
| 2238 |
+
"step": 315
|
| 2239 |
+
},
|
| 2240 |
+
{
|
| 2241 |
+
"epoch": 0.56,
|
| 2242 |
+
"grad_norm": 3.9422831535339355,
|
| 2243 |
+
"learning_rate": 0.0009208099417251077,
|
| 2244 |
+
"loss": 0.2949,
|
| 2245 |
+
"step": 316
|
| 2246 |
+
},
|
| 2247 |
+
{
|
| 2248 |
+
"epoch": 0.56,
|
| 2249 |
+
"grad_norm": 1.7574914693832397,
|
| 2250 |
+
"learning_rate": 0.0009203057444398468,
|
| 2251 |
+
"loss": 0.2621,
|
| 2252 |
+
"step": 317
|
| 2253 |
+
},
|
| 2254 |
+
{
|
| 2255 |
+
"epoch": 0.56,
|
| 2256 |
+
"grad_norm": 0.29479530453681946,
|
| 2257 |
+
"learning_rate": 0.0009198000861043967,
|
| 2258 |
+
"loss": 0.1341,
|
| 2259 |
+
"step": 318
|
| 2260 |
+
},
|
| 2261 |
+
{
|
| 2262 |
+
"epoch": 0.56,
|
| 2263 |
+
"grad_norm": 0.5362827181816101,
|
| 2264 |
+
"learning_rate": 0.0009192929684765068,
|
| 2265 |
+
"loss": 0.1398,
|
| 2266 |
+
"step": 319
|
| 2267 |
+
},
|
| 2268 |
+
{
|
| 2269 |
+
"epoch": 0.57,
|
| 2270 |
+
"grad_norm": 0.8159481287002563,
|
| 2271 |
+
"learning_rate": 0.0009187843933189994,
|
| 2272 |
+
"loss": 0.1863,
|
| 2273 |
+
"step": 320
|
| 2274 |
+
},
|
| 2275 |
+
{
|
| 2276 |
+
"epoch": 0.57,
|
| 2277 |
+
"grad_norm": 0.9413295388221741,
|
| 2278 |
+
"learning_rate": 0.0009182743623997634,
|
| 2279 |
+
"loss": 0.2104,
|
| 2280 |
+
"step": 321
|
| 2281 |
+
},
|
| 2282 |
+
{
|
| 2283 |
+
"epoch": 0.57,
|
| 2284 |
+
"grad_norm": 0.5306220650672913,
|
| 2285 |
+
"learning_rate": 0.0009177628774917479,
|
| 2286 |
+
"loss": 0.1537,
|
| 2287 |
+
"step": 322
|
| 2288 |
+
},
|
| 2289 |
+
{
|
| 2290 |
+
"epoch": 0.57,
|
| 2291 |
+
"grad_norm": 0.8887706398963928,
|
| 2292 |
+
"learning_rate": 0.0009172499403729567,
|
| 2293 |
+
"loss": 0.1963,
|
| 2294 |
+
"step": 323
|
| 2295 |
+
},
|
| 2296 |
+
{
|
| 2297 |
+
"epoch": 0.57,
|
| 2298 |
+
"grad_norm": 0.8467744588851929,
|
| 2299 |
+
"learning_rate": 0.0009167355528264414,
|
| 2300 |
+
"loss": 0.204,
|
| 2301 |
+
"step": 324
|
| 2302 |
+
},
|
| 2303 |
+
{
|
| 2304 |
+
"epoch": 0.57,
|
| 2305 |
+
"grad_norm": 0.19867151975631714,
|
| 2306 |
+
"learning_rate": 0.0009162197166402956,
|
| 2307 |
+
"loss": 0.1407,
|
| 2308 |
+
"step": 325
|
| 2309 |
+
},
|
| 2310 |
+
{
|
| 2311 |
+
"epoch": 0.58,
|
| 2312 |
+
"grad_norm": 0.13638383150100708,
|
| 2313 |
+
"learning_rate": 0.0009157024336076487,
|
| 2314 |
+
"loss": 0.1408,
|
| 2315 |
+
"step": 326
|
| 2316 |
+
},
|
| 2317 |
+
{
|
| 2318 |
+
"epoch": 0.58,
|
| 2319 |
+
"grad_norm": 0.2027496099472046,
|
| 2320 |
+
"learning_rate": 0.0009151837055266594,
|
| 2321 |
+
"loss": 0.1444,
|
| 2322 |
+
"step": 327
|
| 2323 |
+
},
|
| 2324 |
+
{
|
| 2325 |
+
"epoch": 0.58,
|
| 2326 |
+
"grad_norm": 0.370151549577713,
|
| 2327 |
+
"learning_rate": 0.0009146635342005098,
|
| 2328 |
+
"loss": 0.158,
|
| 2329 |
+
"step": 328
|
| 2330 |
+
},
|
| 2331 |
+
{
|
| 2332 |
+
"epoch": 0.58,
|
| 2333 |
+
"grad_norm": 0.3114052414894104,
|
| 2334 |
+
"learning_rate": 0.000914141921437399,
|
| 2335 |
+
"loss": 0.1464,
|
| 2336 |
+
"step": 329
|
| 2337 |
+
},
|
| 2338 |
+
{
|
| 2339 |
+
"epoch": 0.58,
|
| 2340 |
+
"grad_norm": 0.15394961833953857,
|
| 2341 |
+
"learning_rate": 0.0009136188690505362,
|
| 2342 |
+
"loss": 0.1341,
|
| 2343 |
+
"step": 330
|
| 2344 |
+
},
|
| 2345 |
+
{
|
| 2346 |
+
"epoch": 0.59,
|
| 2347 |
+
"grad_norm": 0.46498528122901917,
|
| 2348 |
+
"learning_rate": 0.0009130943788581359,
|
| 2349 |
+
"loss": 0.1426,
|
| 2350 |
+
"step": 331
|
| 2351 |
+
},
|
| 2352 |
+
{
|
| 2353 |
+
"epoch": 0.59,
|
| 2354 |
+
"grad_norm": 0.28067877888679504,
|
| 2355 |
+
"learning_rate": 0.00091256845268341,
|
| 2356 |
+
"loss": 0.1409,
|
| 2357 |
+
"step": 332
|
| 2358 |
+
},
|
| 2359 |
+
{
|
| 2360 |
+
"epoch": 0.59,
|
| 2361 |
+
"grad_norm": 0.061186857521533966,
|
| 2362 |
+
"learning_rate": 0.0009120410923545619,
|
| 2363 |
+
"loss": 0.1401,
|
| 2364 |
+
"step": 333
|
| 2365 |
+
},
|
| 2366 |
+
{
|
| 2367 |
+
"epoch": 0.59,
|
| 2368 |
+
"grad_norm": 0.26736098527908325,
|
| 2369 |
+
"learning_rate": 0.0009115122997047811,
|
| 2370 |
+
"loss": 0.1467,
|
| 2371 |
+
"step": 334
|
| 2372 |
+
},
|
| 2373 |
+
{
|
| 2374 |
+
"epoch": 0.59,
|
| 2375 |
+
"grad_norm": 0.5139696598052979,
|
| 2376 |
+
"learning_rate": 0.0009109820765722356,
|
| 2377 |
+
"loss": 0.1585,
|
| 2378 |
+
"step": 335
|
| 2379 |
+
},
|
| 2380 |
+
{
|
| 2381 |
+
"epoch": 0.59,
|
| 2382 |
+
"grad_norm": 0.40007275342941284,
|
| 2383 |
+
"learning_rate": 0.000910450424800066,
|
| 2384 |
+
"loss": 0.1473,
|
| 2385 |
+
"step": 336
|
| 2386 |
+
},
|
| 2387 |
+
{
|
| 2388 |
+
"epoch": 0.6,
|
| 2389 |
+
"grad_norm": 0.66825270652771,
|
| 2390 |
+
"learning_rate": 0.0009099173462363792,
|
| 2391 |
+
"loss": 0.1572,
|
| 2392 |
+
"step": 337
|
| 2393 |
+
},
|
| 2394 |
+
{
|
| 2395 |
+
"epoch": 0.6,
|
| 2396 |
+
"grad_norm": 0.5313024520874023,
|
| 2397 |
+
"learning_rate": 0.0009093828427342418,
|
| 2398 |
+
"loss": 0.1555,
|
| 2399 |
+
"step": 338
|
| 2400 |
+
},
|
| 2401 |
+
{
|
| 2402 |
+
"epoch": 0.6,
|
| 2403 |
+
"grad_norm": 0.4224655330181122,
|
| 2404 |
+
"learning_rate": 0.0009088469161516735,
|
| 2405 |
+
"loss": 0.1429,
|
| 2406 |
+
"step": 339
|
| 2407 |
+
},
|
| 2408 |
+
{
|
| 2409 |
+
"epoch": 0.6,
|
| 2410 |
+
"grad_norm": 0.03462248668074608,
|
| 2411 |
+
"learning_rate": 0.0009083095683516414,
|
| 2412 |
+
"loss": 0.1325,
|
| 2413 |
+
"step": 340
|
| 2414 |
+
},
|
| 2415 |
+
{
|
| 2416 |
+
"epoch": 0.6,
|
| 2417 |
+
"grad_norm": 0.542322039604187,
|
| 2418 |
+
"learning_rate": 0.0009077708012020524,
|
| 2419 |
+
"loss": 0.1755,
|
| 2420 |
+
"step": 341
|
| 2421 |
+
},
|
| 2422 |
+
{
|
| 2423 |
+
"epoch": 0.61,
|
| 2424 |
+
"grad_norm": 0.2164747267961502,
|
| 2425 |
+
"learning_rate": 0.0009072306165757476,
|
| 2426 |
+
"loss": 0.1458,
|
| 2427 |
+
"step": 342
|
| 2428 |
+
},
|
| 2429 |
+
{
|
| 2430 |
+
"epoch": 0.61,
|
| 2431 |
+
"grad_norm": 0.27414461970329285,
|
| 2432 |
+
"learning_rate": 0.0009066890163504955,
|
| 2433 |
+
"loss": 0.1512,
|
| 2434 |
+
"step": 343
|
| 2435 |
+
},
|
| 2436 |
+
{
|
| 2437 |
+
"epoch": 0.61,
|
| 2438 |
+
"grad_norm": 0.1911482959985733,
|
| 2439 |
+
"learning_rate": 0.0009061460024089853,
|
| 2440 |
+
"loss": 0.1185,
|
| 2441 |
+
"step": 344
|
| 2442 |
+
},
|
| 2443 |
+
{
|
| 2444 |
+
"epoch": 0.61,
|
| 2445 |
+
"grad_norm": 0.1287711262702942,
|
| 2446 |
+
"learning_rate": 0.0009056015766388205,
|
| 2447 |
+
"loss": 0.1372,
|
| 2448 |
+
"step": 345
|
| 2449 |
+
},
|
| 2450 |
+
{
|
| 2451 |
+
"epoch": 0.61,
|
| 2452 |
+
"grad_norm": 0.18598809838294983,
|
| 2453 |
+
"learning_rate": 0.0009050557409325125,
|
| 2454 |
+
"loss": 0.1341,
|
| 2455 |
+
"step": 346
|
| 2456 |
+
},
|
| 2457 |
+
{
|
| 2458 |
+
"epoch": 0.61,
|
| 2459 |
+
"grad_norm": 0.18694853782653809,
|
| 2460 |
+
"learning_rate": 0.0009045084971874737,
|
| 2461 |
+
"loss": 0.141,
|
| 2462 |
+
"step": 347
|
| 2463 |
+
},
|
| 2464 |
+
{
|
| 2465 |
+
"epoch": 0.62,
|
| 2466 |
+
"grad_norm": 0.06479912996292114,
|
| 2467 |
+
"learning_rate": 0.0009039598473060113,
|
| 2468 |
+
"loss": 0.1368,
|
| 2469 |
+
"step": 348
|
| 2470 |
+
},
|
| 2471 |
+
{
|
| 2472 |
+
"epoch": 0.62,
|
| 2473 |
+
"grad_norm": 0.17768733203411102,
|
| 2474 |
+
"learning_rate": 0.0009034097931953201,
|
| 2475 |
+
"loss": 0.1381,
|
| 2476 |
+
"step": 349
|
| 2477 |
+
},
|
| 2478 |
+
{
|
| 2479 |
+
"epoch": 0.62,
|
| 2480 |
+
"grad_norm": 0.28938984870910645,
|
| 2481 |
+
"learning_rate": 0.0009028583367674765,
|
| 2482 |
+
"loss": 0.1365,
|
| 2483 |
+
"step": 350
|
| 2484 |
+
},
|
| 2485 |
+
{
|
| 2486 |
+
"epoch": 0.62,
|
| 2487 |
+
"grad_norm": 0.2924034893512726,
|
| 2488 |
+
"learning_rate": 0.0009023054799394316,
|
| 2489 |
+
"loss": 0.1282,
|
| 2490 |
+
"step": 351
|
| 2491 |
+
},
|
| 2492 |
+
{
|
| 2493 |
+
"epoch": 0.62,
|
| 2494 |
+
"grad_norm": 0.28439652919769287,
|
| 2495 |
+
"learning_rate": 0.0009017512246330042,
|
| 2496 |
+
"loss": 0.151,
|
| 2497 |
+
"step": 352
|
| 2498 |
+
},
|
| 2499 |
+
{
|
| 2500 |
+
"epoch": 0.62,
|
| 2501 |
+
"grad_norm": 0.14329224824905396,
|
| 2502 |
+
"learning_rate": 0.0009011955727748749,
|
| 2503 |
+
"loss": 0.1419,
|
| 2504 |
+
"step": 353
|
| 2505 |
+
},
|
| 2506 |
+
{
|
| 2507 |
+
"epoch": 0.63,
|
| 2508 |
+
"grad_norm": 0.15245947241783142,
|
| 2509 |
+
"learning_rate": 0.0009006385262965785,
|
| 2510 |
+
"loss": 0.1163,
|
| 2511 |
+
"step": 354
|
| 2512 |
+
},
|
| 2513 |
+
{
|
| 2514 |
+
"epoch": 0.63,
|
| 2515 |
+
"grad_norm": 0.052399642765522,
|
| 2516 |
+
"learning_rate": 0.000900080087134498,
|
| 2517 |
+
"loss": 0.1241,
|
| 2518 |
+
"step": 355
|
| 2519 |
+
},
|
| 2520 |
+
{
|
| 2521 |
+
"epoch": 0.63,
|
| 2522 |
+
"grad_norm": 0.030301153659820557,
|
| 2523 |
+
"learning_rate": 0.0008995202572298575,
|
| 2524 |
+
"loss": 0.1232,
|
| 2525 |
+
"step": 356
|
| 2526 |
+
},
|
| 2527 |
+
{
|
| 2528 |
+
"epoch": 0.63,
|
| 2529 |
+
"grad_norm": 0.41738417744636536,
|
| 2530 |
+
"learning_rate": 0.0008989590385287155,
|
| 2531 |
+
"loss": 0.1675,
|
| 2532 |
+
"step": 357
|
| 2533 |
+
},
|
| 2534 |
+
{
|
| 2535 |
+
"epoch": 0.63,
|
| 2536 |
+
"grad_norm": 0.19307875633239746,
|
| 2537 |
+
"learning_rate": 0.0008983964329819583,
|
| 2538 |
+
"loss": 0.1328,
|
| 2539 |
+
"step": 358
|
| 2540 |
+
},
|
| 2541 |
+
{
|
| 2542 |
+
"epoch": 0.64,
|
| 2543 |
+
"grad_norm": 0.05682377517223358,
|
| 2544 |
+
"learning_rate": 0.000897832442545293,
|
| 2545 |
+
"loss": 0.1322,
|
| 2546 |
+
"step": 359
|
| 2547 |
+
},
|
| 2548 |
+
{
|
| 2549 |
+
"epoch": 0.64,
|
| 2550 |
+
"grad_norm": 0.15418089926242828,
|
| 2551 |
+
"learning_rate": 0.0008972670691792409,
|
| 2552 |
+
"loss": 0.1414,
|
| 2553 |
+
"step": 360
|
| 2554 |
+
},
|
| 2555 |
+
{
|
| 2556 |
+
"epoch": 0.64,
|
| 2557 |
+
"grad_norm": 0.07167459279298782,
|
| 2558 |
+
"learning_rate": 0.0008967003148491304,
|
| 2559 |
+
"loss": 0.1414,
|
| 2560 |
+
"step": 361
|
| 2561 |
+
},
|
| 2562 |
+
{
|
| 2563 |
+
"epoch": 0.64,
|
| 2564 |
+
"grad_norm": 0.2866109609603882,
|
| 2565 |
+
"learning_rate": 0.0008961321815250904,
|
| 2566 |
+
"loss": 0.1381,
|
| 2567 |
+
"step": 362
|
| 2568 |
+
},
|
| 2569 |
+
{
|
| 2570 |
+
"epoch": 0.64,
|
| 2571 |
+
"grad_norm": 0.281264990568161,
|
| 2572 |
+
"learning_rate": 0.0008955626711820438,
|
| 2573 |
+
"loss": 0.1365,
|
| 2574 |
+
"step": 363
|
| 2575 |
+
},
|
| 2576 |
+
{
|
| 2577 |
+
"epoch": 0.64,
|
| 2578 |
+
"grad_norm": 0.19263768196105957,
|
| 2579 |
+
"learning_rate": 0.0008949917857996997,
|
| 2580 |
+
"loss": 0.1394,
|
| 2581 |
+
"step": 364
|
| 2582 |
+
},
|
| 2583 |
+
{
|
| 2584 |
+
"epoch": 0.65,
|
| 2585 |
+
"grad_norm": 0.30531641840934753,
|
| 2586 |
+
"learning_rate": 0.0008944195273625471,
|
| 2587 |
+
"loss": 0.1478,
|
| 2588 |
+
"step": 365
|
| 2589 |
+
},
|
| 2590 |
+
{
|
| 2591 |
+
"epoch": 0.65,
|
| 2592 |
+
"grad_norm": 0.16229306161403656,
|
| 2593 |
+
"learning_rate": 0.0008938458978598483,
|
| 2594 |
+
"loss": 0.1412,
|
| 2595 |
+
"step": 366
|
| 2596 |
+
},
|
| 2597 |
+
{
|
| 2598 |
+
"epoch": 0.65,
|
| 2599 |
+
"grad_norm": 0.09315463900566101,
|
| 2600 |
+
"learning_rate": 0.0008932708992856315,
|
| 2601 |
+
"loss": 0.1397,
|
| 2602 |
+
"step": 367
|
| 2603 |
+
},
|
| 2604 |
+
{
|
| 2605 |
+
"epoch": 0.65,
|
| 2606 |
+
"grad_norm": 0.04228806868195534,
|
| 2607 |
+
"learning_rate": 0.0008926945336386838,
|
| 2608 |
+
"loss": 0.1383,
|
| 2609 |
+
"step": 368
|
| 2610 |
+
},
|
| 2611 |
+
{
|
| 2612 |
+
"epoch": 0.65,
|
| 2613 |
+
"grad_norm": 0.2209407389163971,
|
| 2614 |
+
"learning_rate": 0.0008921168029225448,
|
| 2615 |
+
"loss": 0.1434,
|
| 2616 |
+
"step": 369
|
| 2617 |
+
},
|
| 2618 |
+
{
|
| 2619 |
+
"epoch": 0.65,
|
| 2620 |
+
"grad_norm": 0.04254443198442459,
|
| 2621 |
+
"learning_rate": 0.0008915377091454992,
|
| 2622 |
+
"loss": 0.1326,
|
| 2623 |
+
"step": 370
|
| 2624 |
+
},
|
| 2625 |
+
{
|
| 2626 |
+
"epoch": 0.66,
|
| 2627 |
+
"grad_norm": 0.09651175886392593,
|
| 2628 |
+
"learning_rate": 0.0008909572543205698,
|
| 2629 |
+
"loss": 0.134,
|
| 2630 |
+
"step": 371
|
| 2631 |
+
},
|
| 2632 |
+
{
|
| 2633 |
+
"epoch": 0.66,
|
| 2634 |
+
"grad_norm": 0.2821654975414276,
|
| 2635 |
+
"learning_rate": 0.0008903754404655105,
|
| 2636 |
+
"loss": 0.1498,
|
| 2637 |
+
"step": 372
|
| 2638 |
+
},
|
| 2639 |
+
{
|
| 2640 |
+
"epoch": 0.66,
|
| 2641 |
+
"grad_norm": 0.43042680621147156,
|
| 2642 |
+
"learning_rate": 0.0008897922696027998,
|
| 2643 |
+
"loss": 0.1571,
|
| 2644 |
+
"step": 373
|
| 2645 |
+
},
|
| 2646 |
+
{
|
| 2647 |
+
"epoch": 0.66,
|
| 2648 |
+
"grad_norm": 0.06591568142175674,
|
| 2649 |
+
"learning_rate": 0.0008892077437596332,
|
| 2650 |
+
"loss": 0.1391,
|
| 2651 |
+
"step": 374
|
| 2652 |
+
},
|
| 2653 |
+
{
|
| 2654 |
+
"epoch": 0.66,
|
| 2655 |
+
"grad_norm": 0.08771979063749313,
|
| 2656 |
+
"learning_rate": 0.0008886218649679161,
|
| 2657 |
+
"loss": 0.1375,
|
| 2658 |
+
"step": 375
|
| 2659 |
+
},
|
| 2660 |
+
{
|
| 2661 |
+
"epoch": 0.67,
|
| 2662 |
+
"grad_norm": 0.03339942544698715,
|
| 2663 |
+
"learning_rate": 0.0008880346352642574,
|
| 2664 |
+
"loss": 0.1368,
|
| 2665 |
+
"step": 376
|
| 2666 |
+
},
|
| 2667 |
+
{
|
| 2668 |
+
"epoch": 0.67,
|
| 2669 |
+
"grad_norm": 0.15352453291416168,
|
| 2670 |
+
"learning_rate": 0.0008874460566899616,
|
| 2671 |
+
"loss": 0.1447,
|
| 2672 |
+
"step": 377
|
| 2673 |
+
},
|
| 2674 |
+
{
|
| 2675 |
+
"epoch": 0.67,
|
| 2676 |
+
"grad_norm": 0.1778584122657776,
|
| 2677 |
+
"learning_rate": 0.0008868561312910222,
|
| 2678 |
+
"loss": 0.1189,
|
| 2679 |
+
"step": 378
|
| 2680 |
+
},
|
| 2681 |
+
{
|
| 2682 |
+
"epoch": 0.67,
|
| 2683 |
+
"grad_norm": 0.11893154680728912,
|
| 2684 |
+
"learning_rate": 0.0008862648611181144,
|
| 2685 |
+
"loss": 0.1167,
|
| 2686 |
+
"step": 379
|
| 2687 |
+
},
|
| 2688 |
+
{
|
| 2689 |
+
"epoch": 0.67,
|
| 2690 |
+
"grad_norm": 0.4323861598968506,
|
| 2691 |
+
"learning_rate": 0.0008856722482265886,
|
| 2692 |
+
"loss": 0.1691,
|
| 2693 |
+
"step": 380
|
| 2694 |
+
},
|
| 2695 |
+
{
|
| 2696 |
+
"epoch": 0.67,
|
| 2697 |
+
"grad_norm": 0.28813356161117554,
|
| 2698 |
+
"learning_rate": 0.0008850782946764618,
|
| 2699 |
+
"loss": 0.1505,
|
| 2700 |
+
"step": 381
|
| 2701 |
+
},
|
| 2702 |
+
{
|
| 2703 |
+
"epoch": 0.68,
|
| 2704 |
+
"grad_norm": 0.5008757710456848,
|
| 2705 |
+
"learning_rate": 0.0008844830025324122,
|
| 2706 |
+
"loss": 0.1671,
|
| 2707 |
+
"step": 382
|
| 2708 |
+
},
|
| 2709 |
+
{
|
| 2710 |
+
"epoch": 0.68,
|
| 2711 |
+
"grad_norm": 0.12061876803636551,
|
| 2712 |
+
"learning_rate": 0.0008838863738637705,
|
| 2713 |
+
"loss": 0.1375,
|
| 2714 |
+
"step": 383
|
| 2715 |
+
},
|
| 2716 |
+
{
|
| 2717 |
+
"epoch": 0.68,
|
| 2718 |
+
"grad_norm": 0.6747052073478699,
|
| 2719 |
+
"learning_rate": 0.0008832884107445138,
|
| 2720 |
+
"loss": 0.1663,
|
| 2721 |
+
"step": 384
|
| 2722 |
+
},
|
| 2723 |
+
{
|
| 2724 |
+
"epoch": 0.68,
|
| 2725 |
+
"grad_norm": 0.18846777081489563,
|
| 2726 |
+
"learning_rate": 0.0008826891152532579,
|
| 2727 |
+
"loss": 0.1148,
|
| 2728 |
+
"step": 385
|
| 2729 |
+
},
|
| 2730 |
+
{
|
| 2731 |
+
"epoch": 0.68,
|
| 2732 |
+
"grad_norm": 0.0950111448764801,
|
| 2733 |
+
"learning_rate": 0.0008820884894732497,
|
| 2734 |
+
"loss": 0.1138,
|
| 2735 |
+
"step": 386
|
| 2736 |
+
},
|
| 2737 |
+
{
|
| 2738 |
+
"epoch": 0.68,
|
| 2739 |
+
"grad_norm": 0.42371127009391785,
|
| 2740 |
+
"learning_rate": 0.0008814865354923613,
|
| 2741 |
+
"loss": 0.142,
|
| 2742 |
+
"step": 387
|
| 2743 |
+
},
|
| 2744 |
+
{
|
| 2745 |
+
"epoch": 0.69,
|
| 2746 |
+
"grad_norm": 0.17662374675273895,
|
| 2747 |
+
"learning_rate": 0.0008808832554030808,
|
| 2748 |
+
"loss": 0.1255,
|
| 2749 |
+
"step": 388
|
| 2750 |
+
},
|
| 2751 |
+
{
|
| 2752 |
+
"epoch": 0.69,
|
| 2753 |
+
"grad_norm": 0.7766286134719849,
|
| 2754 |
+
"learning_rate": 0.0008802786513025068,
|
| 2755 |
+
"loss": 0.1613,
|
| 2756 |
+
"step": 389
|
| 2757 |
+
},
|
| 2758 |
+
{
|
| 2759 |
+
"epoch": 0.69,
|
| 2760 |
+
"grad_norm": 0.49581214785575867,
|
| 2761 |
+
"learning_rate": 0.0008796727252923403,
|
| 2762 |
+
"loss": 0.1346,
|
| 2763 |
+
"step": 390
|
| 2764 |
+
},
|
| 2765 |
+
{
|
| 2766 |
+
"epoch": 0.69,
|
| 2767 |
+
"grad_norm": 0.6148929595947266,
|
| 2768 |
+
"learning_rate": 0.0008790654794788768,
|
| 2769 |
+
"loss": 0.1426,
|
| 2770 |
+
"step": 391
|
| 2771 |
+
},
|
| 2772 |
+
{
|
| 2773 |
+
"epoch": 0.69,
|
| 2774 |
+
"grad_norm": 0.15860037505626678,
|
| 2775 |
+
"learning_rate": 0.0008784569159730007,
|
| 2776 |
+
"loss": 0.1382,
|
| 2777 |
+
"step": 392
|
| 2778 |
+
},
|
| 2779 |
+
{
|
| 2780 |
+
"epoch": 0.7,
|
| 2781 |
+
"grad_norm": 0.6793199777603149,
|
| 2782 |
+
"learning_rate": 0.0008778470368901761,
|
| 2783 |
+
"loss": 0.1398,
|
| 2784 |
+
"step": 393
|
| 2785 |
+
},
|
| 2786 |
+
{
|
| 2787 |
+
"epoch": 0.7,
|
| 2788 |
+
"grad_norm": 0.40314817428588867,
|
| 2789 |
+
"learning_rate": 0.0008772358443504404,
|
| 2790 |
+
"loss": 0.1428,
|
| 2791 |
+
"step": 394
|
| 2792 |
+
},
|
| 2793 |
+
{
|
| 2794 |
+
"epoch": 0.7,
|
| 2795 |
+
"grad_norm": 0.6403933167457581,
|
| 2796 |
+
"learning_rate": 0.0008766233404783974,
|
| 2797 |
+
"loss": 0.1556,
|
| 2798 |
+
"step": 395
|
| 2799 |
+
},
|
| 2800 |
+
{
|
| 2801 |
+
"epoch": 0.7,
|
| 2802 |
+
"grad_norm": 0.33554157614707947,
|
| 2803 |
+
"learning_rate": 0.0008760095274032083,
|
| 2804 |
+
"loss": 0.1439,
|
| 2805 |
+
"step": 396
|
| 2806 |
+
},
|
| 2807 |
+
{
|
| 2808 |
+
"epoch": 0.7,
|
| 2809 |
+
"grad_norm": 0.45690324902534485,
|
| 2810 |
+
"learning_rate": 0.000875394407258586,
|
| 2811 |
+
"loss": 0.1374,
|
| 2812 |
+
"step": 397
|
| 2813 |
+
},
|
| 2814 |
+
{
|
| 2815 |
+
"epoch": 0.7,
|
| 2816 |
+
"grad_norm": 0.0541120283305645,
|
| 2817 |
+
"learning_rate": 0.0008747779821827868,
|
| 2818 |
+
"loss": 0.1314,
|
| 2819 |
+
"step": 398
|
| 2820 |
+
},
|
| 2821 |
+
{
|
| 2822 |
+
"epoch": 0.71,
|
| 2823 |
+
"grad_norm": 0.6533159613609314,
|
| 2824 |
+
"learning_rate": 0.0008741602543186031,
|
| 2825 |
+
"loss": 0.169,
|
| 2826 |
+
"step": 399
|
| 2827 |
+
},
|
| 2828 |
+
{
|
| 2829 |
+
"epoch": 0.71,
|
| 2830 |
+
"grad_norm": 0.4919282793998718,
|
| 2831 |
+
"learning_rate": 0.0008735412258133561,
|
| 2832 |
+
"loss": 0.1569,
|
| 2833 |
+
"step": 400
|
| 2834 |
+
},
|
| 2835 |
+
{
|
| 2836 |
+
"epoch": 0.71,
|
| 2837 |
+
"grad_norm": 0.30325594544410706,
|
| 2838 |
+
"learning_rate": 0.0008729208988188881,
|
| 2839 |
+
"loss": 0.1471,
|
| 2840 |
+
"step": 401
|
| 2841 |
+
},
|
| 2842 |
+
{
|
| 2843 |
+
"epoch": 0.71,
|
| 2844 |
+
"grad_norm": 0.3497300148010254,
|
| 2845 |
+
"learning_rate": 0.0008722992754915554,
|
| 2846 |
+
"loss": 0.1457,
|
| 2847 |
+
"step": 402
|
| 2848 |
+
},
|
| 2849 |
+
{
|
| 2850 |
+
"epoch": 0.71,
|
| 2851 |
+
"grad_norm": 0.22892774641513824,
|
| 2852 |
+
"learning_rate": 0.0008716763579922203,
|
| 2853 |
+
"loss": 0.1334,
|
| 2854 |
+
"step": 403
|
| 2855 |
+
},
|
| 2856 |
+
{
|
| 2857 |
+
"epoch": 0.71,
|
| 2858 |
+
"grad_norm": 0.20050272345542908,
|
| 2859 |
+
"learning_rate": 0.0008710521484862439,
|
| 2860 |
+
"loss": 0.1446,
|
| 2861 |
+
"step": 404
|
| 2862 |
+
},
|
| 2863 |
+
{
|
| 2864 |
+
"epoch": 0.72,
|
| 2865 |
+
"grad_norm": 0.5029633641242981,
|
| 2866 |
+
"learning_rate": 0.0008704266491434787,
|
| 2867 |
+
"loss": 0.171,
|
| 2868 |
+
"step": 405
|
| 2869 |
+
},
|
| 2870 |
+
{
|
| 2871 |
+
"epoch": 0.72,
|
| 2872 |
+
"grad_norm": 0.2720576226711273,
|
| 2873 |
+
"learning_rate": 0.0008697998621382607,
|
| 2874 |
+
"loss": 0.144,
|
| 2875 |
+
"step": 406
|
| 2876 |
+
},
|
| 2877 |
+
{
|
| 2878 |
+
"epoch": 0.72,
|
| 2879 |
+
"grad_norm": 0.10961242765188217,
|
| 2880 |
+
"learning_rate": 0.000869171789649402,
|
| 2881 |
+
"loss": 0.1349,
|
| 2882 |
+
"step": 407
|
| 2883 |
+
},
|
| 2884 |
+
{
|
| 2885 |
+
"epoch": 0.72,
|
| 2886 |
+
"grad_norm": 0.13584192097187042,
|
| 2887 |
+
"learning_rate": 0.0008685424338601833,
|
| 2888 |
+
"loss": 0.1385,
|
| 2889 |
+
"step": 408
|
| 2890 |
+
},
|
| 2891 |
+
{
|
| 2892 |
+
"epoch": 0.72,
|
| 2893 |
+
"grad_norm": 0.6586437821388245,
|
| 2894 |
+
"learning_rate": 0.0008679117969583464,
|
| 2895 |
+
"loss": 0.1459,
|
| 2896 |
+
"step": 409
|
| 2897 |
+
},
|
| 2898 |
+
{
|
| 2899 |
+
"epoch": 0.73,
|
| 2900 |
+
"grad_norm": 0.24006032943725586,
|
| 2901 |
+
"learning_rate": 0.0008672798811360864,
|
| 2902 |
+
"loss": 0.1344,
|
| 2903 |
+
"step": 410
|
| 2904 |
+
},
|
| 2905 |
+
{
|
| 2906 |
+
"epoch": 0.73,
|
| 2907 |
+
"grad_norm": 0.1859387755393982,
|
| 2908 |
+
"learning_rate": 0.0008666466885900438,
|
| 2909 |
+
"loss": 0.1358,
|
| 2910 |
+
"step": 411
|
| 2911 |
+
},
|
| 2912 |
+
{
|
| 2913 |
+
"epoch": 0.73,
|
| 2914 |
+
"grad_norm": 0.5095134973526001,
|
| 2915 |
+
"learning_rate": 0.0008660122215212977,
|
| 2916 |
+
"loss": 0.1387,
|
| 2917 |
+
"step": 412
|
| 2918 |
+
},
|
| 2919 |
+
{
|
| 2920 |
+
"epoch": 0.73,
|
| 2921 |
+
"grad_norm": 0.1827729493379593,
|
| 2922 |
+
"learning_rate": 0.0008653764821353573,
|
| 2923 |
+
"loss": 0.1377,
|
| 2924 |
+
"step": 413
|
| 2925 |
+
},
|
| 2926 |
+
{
|
| 2927 |
+
"epoch": 0.73,
|
| 2928 |
+
"grad_norm": 0.14332665503025055,
|
| 2929 |
+
"learning_rate": 0.0008647394726421547,
|
| 2930 |
+
"loss": 0.131,
|
| 2931 |
+
"step": 414
|
| 2932 |
+
},
|
| 2933 |
+
{
|
| 2934 |
+
"epoch": 0.73,
|
| 2935 |
+
"grad_norm": 0.383101224899292,
|
| 2936 |
+
"learning_rate": 0.0008641011952560371,
|
| 2937 |
+
"loss": 0.146,
|
| 2938 |
+
"step": 415
|
| 2939 |
+
},
|
| 2940 |
+
{
|
| 2941 |
+
"epoch": 0.74,
|
| 2942 |
+
"grad_norm": 0.19079791009426117,
|
| 2943 |
+
"learning_rate": 0.000863461652195759,
|
| 2944 |
+
"loss": 0.1255,
|
| 2945 |
+
"step": 416
|
| 2946 |
+
},
|
| 2947 |
+
{
|
| 2948 |
+
"epoch": 0.74,
|
| 2949 |
+
"grad_norm": 0.49537310004234314,
|
| 2950 |
+
"learning_rate": 0.0008628208456844747,
|
| 2951 |
+
"loss": 0.1602,
|
| 2952 |
+
"step": 417
|
| 2953 |
+
},
|
| 2954 |
+
{
|
| 2955 |
+
"epoch": 0.74,
|
| 2956 |
+
"grad_norm": 0.5658069849014282,
|
| 2957 |
+
"learning_rate": 0.0008621787779497306,
|
| 2958 |
+
"loss": 0.1518,
|
| 2959 |
+
"step": 418
|
| 2960 |
+
},
|
| 2961 |
+
{
|
| 2962 |
+
"epoch": 0.74,
|
| 2963 |
+
"grad_norm": 0.2572256326675415,
|
| 2964 |
+
"learning_rate": 0.0008615354512234569,
|
| 2965 |
+
"loss": 0.1369,
|
| 2966 |
+
"step": 419
|
| 2967 |
+
},
|
| 2968 |
+
{
|
| 2969 |
+
"epoch": 0.74,
|
| 2970 |
+
"grad_norm": 1.1088945865631104,
|
| 2971 |
+
"learning_rate": 0.0008608908677419605,
|
| 2972 |
+
"loss": 0.1773,
|
| 2973 |
+
"step": 420
|
| 2974 |
+
},
|
| 2975 |
+
{
|
| 2976 |
+
"epoch": 0.74,
|
| 2977 |
+
"grad_norm": 0.35405099391937256,
|
| 2978 |
+
"learning_rate": 0.0008602450297459173,
|
| 2979 |
+
"loss": 0.1441,
|
| 2980 |
+
"step": 421
|
| 2981 |
+
},
|
| 2982 |
+
{
|
| 2983 |
+
"epoch": 0.75,
|
| 2984 |
+
"grad_norm": 0.39150556921958923,
|
| 2985 |
+
"learning_rate": 0.0008595979394803633,
|
| 2986 |
+
"loss": 0.147,
|
| 2987 |
+
"step": 422
|
| 2988 |
+
},
|
| 2989 |
+
{
|
| 2990 |
+
"epoch": 0.75,
|
| 2991 |
+
"grad_norm": 0.07459918409585953,
|
| 2992 |
+
"learning_rate": 0.0008589495991946885,
|
| 2993 |
+
"loss": 0.1338,
|
| 2994 |
+
"step": 423
|
| 2995 |
+
},
|
| 2996 |
+
{
|
| 2997 |
+
"epoch": 0.75,
|
| 2998 |
+
"grad_norm": 0.2999761402606964,
|
| 2999 |
+
"learning_rate": 0.0008583000111426276,
|
| 3000 |
+
"loss": 0.1357,
|
| 3001 |
+
"step": 424
|
| 3002 |
+
},
|
| 3003 |
+
{
|
| 3004 |
+
"epoch": 0.75,
|
| 3005 |
+
"grad_norm": 0.28417065739631653,
|
| 3006 |
+
"learning_rate": 0.0008576491775822525,
|
| 3007 |
+
"loss": 0.1411,
|
| 3008 |
+
"step": 425
|
| 3009 |
+
},
|
| 3010 |
+
{
|
| 3011 |
+
"epoch": 0.75,
|
| 3012 |
+
"grad_norm": 0.32605019211769104,
|
| 3013 |
+
"learning_rate": 0.0008569971007759657,
|
| 3014 |
+
"loss": 0.1329,
|
| 3015 |
+
"step": 426
|
| 3016 |
+
},
|
| 3017 |
+
{
|
| 3018 |
+
"epoch": 0.75,
|
| 3019 |
+
"eval_loss": 0.13750587403774261,
|
| 3020 |
+
"eval_runtime": 15.1749,
|
| 3021 |
+
"eval_samples_per_second": 31.433,
|
| 3022 |
+
"eval_steps_per_second": 7.908,
|
| 3023 |
+
"step": 426
|
| 3024 |
+
},
|
| 3025 |
+
{
|
| 3026 |
+
"epoch": 0.76,
|
| 3027 |
+
"grad_norm": 0.047430120408535004,
|
| 3028 |
+
"learning_rate": 0.0008563437829904903,
|
| 3029 |
+
"loss": 0.1373,
|
| 3030 |
+
"step": 427
|
| 3031 |
+
},
|
| 3032 |
+
{
|
| 3033 |
+
"epoch": 0.76,
|
| 3034 |
+
"grad_norm": 0.4616542160511017,
|
| 3035 |
+
"learning_rate": 0.0008556892264968639,
|
| 3036 |
+
"loss": 0.1534,
|
| 3037 |
+
"step": 428
|
| 3038 |
+
},
|
| 3039 |
+
{
|
| 3040 |
+
"epoch": 0.76,
|
| 3041 |
+
"grad_norm": 0.12317585945129395,
|
| 3042 |
+
"learning_rate": 0.0008550334335704297,
|
| 3043 |
+
"loss": 0.1338,
|
| 3044 |
+
"step": 429
|
| 3045 |
+
},
|
| 3046 |
+
{
|
| 3047 |
+
"epoch": 0.76,
|
| 3048 |
+
"grad_norm": 0.39604276418685913,
|
| 3049 |
+
"learning_rate": 0.0008543764064908295,
|
| 3050 |
+
"loss": 0.1434,
|
| 3051 |
+
"step": 430
|
| 3052 |
+
},
|
| 3053 |
+
{
|
| 3054 |
+
"epoch": 0.76,
|
| 3055 |
+
"grad_norm": 0.3490678369998932,
|
| 3056 |
+
"learning_rate": 0.0008537181475419944,
|
| 3057 |
+
"loss": 0.1365,
|
| 3058 |
+
"step": 431
|
| 3059 |
+
},
|
| 3060 |
+
{
|
| 3061 |
+
"epoch": 0.76,
|
| 3062 |
+
"grad_norm": 0.15001270174980164,
|
| 3063 |
+
"learning_rate": 0.0008530586590121383,
|
| 3064 |
+
"loss": 0.1358,
|
| 3065 |
+
"step": 432
|
| 3066 |
+
},
|
| 3067 |
+
{
|
| 3068 |
+
"epoch": 0.77,
|
| 3069 |
+
"grad_norm": 0.33340635895729065,
|
| 3070 |
+
"learning_rate": 0.0008523979431937492,
|
| 3071 |
+
"loss": 0.1367,
|
| 3072 |
+
"step": 433
|
| 3073 |
+
},
|
| 3074 |
+
{
|
| 3075 |
+
"epoch": 0.77,
|
| 3076 |
+
"grad_norm": 0.06029750779271126,
|
| 3077 |
+
"learning_rate": 0.0008517360023835809,
|
| 3078 |
+
"loss": 0.1366,
|
| 3079 |
+
"step": 434
|
| 3080 |
+
},
|
| 3081 |
+
{
|
| 3082 |
+
"epoch": 0.77,
|
| 3083 |
+
"grad_norm": 0.07978738099336624,
|
| 3084 |
+
"learning_rate": 0.0008510728388826463,
|
| 3085 |
+
"loss": 0.1345,
|
| 3086 |
+
"step": 435
|
| 3087 |
+
},
|
| 3088 |
+
{
|
| 3089 |
+
"epoch": 0.77,
|
| 3090 |
+
"grad_norm": 0.27599036693573,
|
| 3091 |
+
"learning_rate": 0.0008504084549962079,
|
| 3092 |
+
"loss": 0.1447,
|
| 3093 |
+
"step": 436
|
| 3094 |
+
},
|
| 3095 |
+
{
|
| 3096 |
+
"epoch": 0.77,
|
| 3097 |
+
"grad_norm": 0.13302059471607208,
|
| 3098 |
+
"learning_rate": 0.0008497428530337706,
|
| 3099 |
+
"loss": 0.1407,
|
| 3100 |
+
"step": 437
|
| 3101 |
+
},
|
| 3102 |
+
{
|
| 3103 |
+
"epoch": 0.77,
|
| 3104 |
+
"grad_norm": 0.20869582891464233,
|
| 3105 |
+
"learning_rate": 0.0008490760353090737,
|
| 3106 |
+
"loss": 0.1374,
|
| 3107 |
+
"step": 438
|
| 3108 |
+
},
|
| 3109 |
+
{
|
| 3110 |
+
"epoch": 0.78,
|
| 3111 |
+
"grad_norm": 0.10881117731332779,
|
| 3112 |
+
"learning_rate": 0.0008484080041400825,
|
| 3113 |
+
"loss": 0.1429,
|
| 3114 |
+
"step": 439
|
| 3115 |
+
},
|
| 3116 |
+
{
|
| 3117 |
+
"epoch": 0.78,
|
| 3118 |
+
"grad_norm": 0.20344361662864685,
|
| 3119 |
+
"learning_rate": 0.0008477387618489807,
|
| 3120 |
+
"loss": 0.139,
|
| 3121 |
+
"step": 440
|
| 3122 |
+
},
|
| 3123 |
+
{
|
| 3124 |
+
"epoch": 0.78,
|
| 3125 |
+
"grad_norm": 0.07153432071208954,
|
| 3126 |
+
"learning_rate": 0.0008470683107621615,
|
| 3127 |
+
"loss": 0.1315,
|
| 3128 |
+
"step": 441
|
| 3129 |
+
},
|
| 3130 |
+
{
|
| 3131 |
+
"epoch": 0.78,
|
| 3132 |
+
"grad_norm": 0.08688751608133316,
|
| 3133 |
+
"learning_rate": 0.0008463966532102207,
|
| 3134 |
+
"loss": 0.1346,
|
| 3135 |
+
"step": 442
|
| 3136 |
+
},
|
| 3137 |
+
{
|
| 3138 |
+
"epoch": 0.78,
|
| 3139 |
+
"grad_norm": 0.06495650112628937,
|
| 3140 |
+
"learning_rate": 0.0008457237915279476,
|
| 3141 |
+
"loss": 0.1307,
|
| 3142 |
+
"step": 443
|
| 3143 |
+
},
|
| 3144 |
+
{
|
| 3145 |
+
"epoch": 0.79,
|
| 3146 |
+
"grad_norm": 0.1892390102148056,
|
| 3147 |
+
"learning_rate": 0.0008450497280543173,
|
| 3148 |
+
"loss": 0.12,
|
| 3149 |
+
"step": 444
|
| 3150 |
+
},
|
| 3151 |
+
{
|
| 3152 |
+
"epoch": 0.79,
|
| 3153 |
+
"grad_norm": 0.2579623758792877,
|
| 3154 |
+
"learning_rate": 0.0008443744651324827,
|
| 3155 |
+
"loss": 0.1531,
|
| 3156 |
+
"step": 445
|
| 3157 |
+
},
|
| 3158 |
+
{
|
| 3159 |
+
"epoch": 0.79,
|
| 3160 |
+
"grad_norm": 0.149379700422287,
|
| 3161 |
+
"learning_rate": 0.000843698005109766,
|
| 3162 |
+
"loss": 0.1385,
|
| 3163 |
+
"step": 446
|
| 3164 |
+
},
|
| 3165 |
+
{
|
| 3166 |
+
"epoch": 0.79,
|
| 3167 |
+
"grad_norm": 0.19281132519245148,
|
| 3168 |
+
"learning_rate": 0.0008430203503376506,
|
| 3169 |
+
"loss": 0.1033,
|
| 3170 |
+
"step": 447
|
| 3171 |
+
},
|
| 3172 |
+
{
|
| 3173 |
+
"epoch": 0.79,
|
| 3174 |
+
"grad_norm": 0.33208444714546204,
|
| 3175 |
+
"learning_rate": 0.0008423415031717733,
|
| 3176 |
+
"loss": 0.1525,
|
| 3177 |
+
"step": 448
|
| 3178 |
+
},
|
| 3179 |
+
{
|
| 3180 |
+
"epoch": 0.79,
|
| 3181 |
+
"grad_norm": 0.15149784088134766,
|
| 3182 |
+
"learning_rate": 0.0008416614659719157,
|
| 3183 |
+
"loss": 0.1282,
|
| 3184 |
+
"step": 449
|
| 3185 |
+
},
|
| 3186 |
+
{
|
| 3187 |
+
"epoch": 0.8,
|
| 3188 |
+
"grad_norm": 0.24646438658237457,
|
| 3189 |
+
"learning_rate": 0.0008409802411019962,
|
| 3190 |
+
"loss": 0.1393,
|
| 3191 |
+
"step": 450
|
| 3192 |
+
},
|
| 3193 |
+
{
|
| 3194 |
+
"epoch": 0.8,
|
| 3195 |
+
"grad_norm": 0.2505553662776947,
|
| 3196 |
+
"learning_rate": 0.000840297830930062,
|
| 3197 |
+
"loss": 0.1453,
|
| 3198 |
+
"step": 451
|
| 3199 |
+
},
|
| 3200 |
+
{
|
| 3201 |
+
"epoch": 0.8,
|
| 3202 |
+
"grad_norm": 0.1632508784532547,
|
| 3203 |
+
"learning_rate": 0.0008396142378282799,
|
| 3204 |
+
"loss": 0.1274,
|
| 3205 |
+
"step": 452
|
| 3206 |
+
},
|
| 3207 |
+
{
|
| 3208 |
+
"epoch": 0.8,
|
| 3209 |
+
"grad_norm": 0.12370573729276657,
|
| 3210 |
+
"learning_rate": 0.0008389294641729292,
|
| 3211 |
+
"loss": 0.1201,
|
| 3212 |
+
"step": 453
|
| 3213 |
+
},
|
| 3214 |
+
{
|
| 3215 |
+
"epoch": 0.8,
|
| 3216 |
+
"grad_norm": 0.08046772330999374,
|
| 3217 |
+
"learning_rate": 0.0008382435123443934,
|
| 3218 |
+
"loss": 0.1263,
|
| 3219 |
+
"step": 454
|
| 3220 |
+
},
|
| 3221 |
+
{
|
| 3222 |
+
"epoch": 0.8,
|
| 3223 |
+
"grad_norm": 0.19015488028526306,
|
| 3224 |
+
"learning_rate": 0.0008375563847271506,
|
| 3225 |
+
"loss": 0.1318,
|
| 3226 |
+
"step": 455
|
| 3227 |
+
},
|
| 3228 |
+
{
|
| 3229 |
+
"epoch": 0.81,
|
| 3230 |
+
"grad_norm": 0.3562954366207123,
|
| 3231 |
+
"learning_rate": 0.0008368680837097669,
|
| 3232 |
+
"loss": 0.132,
|
| 3233 |
+
"step": 456
|
| 3234 |
+
},
|
| 3235 |
+
{
|
| 3236 |
+
"epoch": 0.81,
|
| 3237 |
+
"grad_norm": 0.06315189599990845,
|
| 3238 |
+
"learning_rate": 0.000836178611684887,
|
| 3239 |
+
"loss": 0.1113,
|
| 3240 |
+
"step": 457
|
| 3241 |
+
},
|
| 3242 |
+
{
|
| 3243 |
+
"epoch": 0.81,
|
| 3244 |
+
"grad_norm": 0.43667125701904297,
|
| 3245 |
+
"learning_rate": 0.0008354879710492264,
|
| 3246 |
+
"loss": 0.1908,
|
| 3247 |
+
"step": 458
|
| 3248 |
+
},
|
| 3249 |
+
{
|
| 3250 |
+
"epoch": 0.81,
|
| 3251 |
+
"grad_norm": 0.0708879753947258,
|
| 3252 |
+
"learning_rate": 0.0008347961642035624,
|
| 3253 |
+
"loss": 0.1399,
|
| 3254 |
+
"step": 459
|
| 3255 |
+
},
|
| 3256 |
+
{
|
| 3257 |
+
"epoch": 0.81,
|
| 3258 |
+
"grad_norm": 0.04855835437774658,
|
| 3259 |
+
"learning_rate": 0.0008341031935527267,
|
| 3260 |
+
"loss": 0.1258,
|
| 3261 |
+
"step": 460
|
| 3262 |
+
},
|
| 3263 |
+
{
|
| 3264 |
+
"epoch": 0.82,
|
| 3265 |
+
"grad_norm": 0.1364990919828415,
|
| 3266 |
+
"learning_rate": 0.0008334090615055965,
|
| 3267 |
+
"loss": 0.1344,
|
| 3268 |
+
"step": 461
|
| 3269 |
+
},
|
| 3270 |
+
{
|
| 3271 |
+
"epoch": 0.82,
|
| 3272 |
+
"grad_norm": 0.08166524022817612,
|
| 3273 |
+
"learning_rate": 0.0008327137704750862,
|
| 3274 |
+
"loss": 0.134,
|
| 3275 |
+
"step": 462
|
| 3276 |
+
},
|
| 3277 |
+
{
|
| 3278 |
+
"epoch": 0.82,
|
| 3279 |
+
"grad_norm": 0.09308458864688873,
|
| 3280 |
+
"learning_rate": 0.0008320173228781389,
|
| 3281 |
+
"loss": 0.1507,
|
| 3282 |
+
"step": 463
|
| 3283 |
+
},
|
| 3284 |
+
{
|
| 3285 |
+
"epoch": 0.82,
|
| 3286 |
+
"grad_norm": 0.07796576619148254,
|
| 3287 |
+
"learning_rate": 0.000831319721135718,
|
| 3288 |
+
"loss": 0.1284,
|
| 3289 |
+
"step": 464
|
| 3290 |
+
},
|
| 3291 |
+
{
|
| 3292 |
+
"epoch": 0.82,
|
| 3293 |
+
"grad_norm": 0.12168626487255096,
|
| 3294 |
+
"learning_rate": 0.0008306209676727993,
|
| 3295 |
+
"loss": 0.148,
|
| 3296 |
+
"step": 465
|
| 3297 |
+
},
|
| 3298 |
+
{
|
| 3299 |
+
"epoch": 0.82,
|
| 3300 |
+
"grad_norm": 0.18862847983837128,
|
| 3301 |
+
"learning_rate": 0.000829921064918362,
|
| 3302 |
+
"loss": 0.1229,
|
| 3303 |
+
"step": 466
|
| 3304 |
+
},
|
| 3305 |
+
{
|
| 3306 |
+
"epoch": 0.83,
|
| 3307 |
+
"grad_norm": 0.23615515232086182,
|
| 3308 |
+
"learning_rate": 0.00082922001530538,
|
| 3309 |
+
"loss": 0.1322,
|
| 3310 |
+
"step": 467
|
| 3311 |
+
},
|
| 3312 |
+
{
|
| 3313 |
+
"epoch": 0.83,
|
| 3314 |
+
"grad_norm": 0.34108766913414,
|
| 3315 |
+
"learning_rate": 0.0008285178212708142,
|
| 3316 |
+
"loss": 0.1338,
|
| 3317 |
+
"step": 468
|
| 3318 |
+
},
|
| 3319 |
+
{
|
| 3320 |
+
"epoch": 0.83,
|
| 3321 |
+
"grad_norm": 0.39579400420188904,
|
| 3322 |
+
"learning_rate": 0.0008278144852556042,
|
| 3323 |
+
"loss": 0.1341,
|
| 3324 |
+
"step": 469
|
| 3325 |
+
},
|
| 3326 |
+
{
|
| 3327 |
+
"epoch": 0.83,
|
| 3328 |
+
"grad_norm": 0.2620592713356018,
|
| 3329 |
+
"learning_rate": 0.0008271100097046585,
|
| 3330 |
+
"loss": 0.1395,
|
| 3331 |
+
"step": 470
|
| 3332 |
+
},
|
| 3333 |
+
{
|
| 3334 |
+
"epoch": 0.83,
|
| 3335 |
+
"grad_norm": 0.08778171986341476,
|
| 3336 |
+
"learning_rate": 0.0008264043970668469,
|
| 3337 |
+
"loss": 0.1328,
|
| 3338 |
+
"step": 471
|
| 3339 |
+
},
|
| 3340 |
+
{
|
| 3341 |
+
"epoch": 0.84,
|
| 3342 |
+
"grad_norm": 0.6086364388465881,
|
| 3343 |
+
"learning_rate": 0.0008256976497949924,
|
| 3344 |
+
"loss": 0.1271,
|
| 3345 |
+
"step": 472
|
| 3346 |
+
},
|
| 3347 |
+
{
|
| 3348 |
+
"epoch": 0.84,
|
| 3349 |
+
"grad_norm": 0.08982394635677338,
|
| 3350 |
+
"learning_rate": 0.0008249897703458619,
|
| 3351 |
+
"loss": 0.1346,
|
| 3352 |
+
"step": 473
|
| 3353 |
+
},
|
| 3354 |
+
{
|
| 3355 |
+
"epoch": 0.84,
|
| 3356 |
+
"grad_norm": 0.054080091416835785,
|
| 3357 |
+
"learning_rate": 0.0008242807611801578,
|
| 3358 |
+
"loss": 0.1218,
|
| 3359 |
+
"step": 474
|
| 3360 |
+
},
|
| 3361 |
+
{
|
| 3362 |
+
"epoch": 0.84,
|
| 3363 |
+
"grad_norm": 0.5981457829475403,
|
| 3364 |
+
"learning_rate": 0.0008235706247625098,
|
| 3365 |
+
"loss": 0.1715,
|
| 3366 |
+
"step": 475
|
| 3367 |
+
},
|
| 3368 |
+
{
|
| 3369 |
+
"epoch": 0.84,
|
| 3370 |
+
"grad_norm": 0.9139420986175537,
|
| 3371 |
+
"learning_rate": 0.0008228593635614659,
|
| 3372 |
+
"loss": 0.1983,
|
| 3373 |
+
"step": 476
|
| 3374 |
+
},
|
| 3375 |
+
{
|
| 3376 |
+
"epoch": 0.84,
|
| 3377 |
+
"grad_norm": 0.05938498303294182,
|
| 3378 |
+
"learning_rate": 0.0008221469800494841,
|
| 3379 |
+
"loss": 0.1308,
|
| 3380 |
+
"step": 477
|
| 3381 |
+
},
|
| 3382 |
+
{
|
| 3383 |
+
"epoch": 0.85,
|
| 3384 |
+
"grad_norm": 0.11526026576757431,
|
| 3385 |
+
"learning_rate": 0.0008214334767029239,
|
| 3386 |
+
"loss": 0.1422,
|
| 3387 |
+
"step": 478
|
| 3388 |
+
},
|
| 3389 |
+
{
|
| 3390 |
+
"epoch": 0.85,
|
| 3391 |
+
"grad_norm": 0.3049907386302948,
|
| 3392 |
+
"learning_rate": 0.0008207188560020373,
|
| 3393 |
+
"loss": 0.1419,
|
| 3394 |
+
"step": 479
|
| 3395 |
+
},
|
| 3396 |
+
{
|
| 3397 |
+
"epoch": 0.85,
|
| 3398 |
+
"grad_norm": 0.04782035946846008,
|
| 3399 |
+
"learning_rate": 0.0008200031204309604,
|
| 3400 |
+
"loss": 0.138,
|
| 3401 |
+
"step": 480
|
| 3402 |
+
},
|
| 3403 |
+
{
|
| 3404 |
+
"epoch": 0.85,
|
| 3405 |
+
"grad_norm": 0.12950918078422546,
|
| 3406 |
+
"learning_rate": 0.000819286272477705,
|
| 3407 |
+
"loss": 0.1315,
|
| 3408 |
+
"step": 481
|
| 3409 |
+
},
|
| 3410 |
+
{
|
| 3411 |
+
"epoch": 0.85,
|
| 3412 |
+
"grad_norm": 0.0429329015314579,
|
| 3413 |
+
"learning_rate": 0.0008185683146341496,
|
| 3414 |
+
"loss": 0.1354,
|
| 3415 |
+
"step": 482
|
| 3416 |
+
},
|
| 3417 |
+
{
|
| 3418 |
+
"epoch": 0.85,
|
| 3419 |
+
"grad_norm": 0.4792588949203491,
|
| 3420 |
+
"learning_rate": 0.0008178492493960308,
|
| 3421 |
+
"loss": 0.1476,
|
| 3422 |
+
"step": 483
|
| 3423 |
+
},
|
| 3424 |
+
{
|
| 3425 |
+
"epoch": 0.86,
|
| 3426 |
+
"grad_norm": 0.19784927368164062,
|
| 3427 |
+
"learning_rate": 0.0008171290792629346,
|
| 3428 |
+
"loss": 0.1394,
|
| 3429 |
+
"step": 484
|
| 3430 |
+
},
|
| 3431 |
+
{
|
| 3432 |
+
"epoch": 0.86,
|
| 3433 |
+
"grad_norm": 0.1172945499420166,
|
| 3434 |
+
"learning_rate": 0.000816407806738288,
|
| 3435 |
+
"loss": 0.1302,
|
| 3436 |
+
"step": 485
|
| 3437 |
+
},
|
| 3438 |
+
{
|
| 3439 |
+
"epoch": 0.86,
|
| 3440 |
+
"grad_norm": 0.3732689917087555,
|
| 3441 |
+
"learning_rate": 0.0008156854343293501,
|
| 3442 |
+
"loss": 0.1416,
|
| 3443 |
+
"step": 486
|
| 3444 |
+
},
|
| 3445 |
+
{
|
| 3446 |
+
"epoch": 0.86,
|
| 3447 |
+
"grad_norm": 0.5152392983436584,
|
| 3448 |
+
"learning_rate": 0.0008149619645472031,
|
| 3449 |
+
"loss": 0.1403,
|
| 3450 |
+
"step": 487
|
| 3451 |
+
},
|
| 3452 |
+
{
|
| 3453 |
+
"epoch": 0.86,
|
| 3454 |
+
"grad_norm": 0.15429601073265076,
|
| 3455 |
+
"learning_rate": 0.000814237399906744,
|
| 3456 |
+
"loss": 0.1322,
|
| 3457 |
+
"step": 488
|
| 3458 |
+
},
|
| 3459 |
+
{
|
| 3460 |
+
"epoch": 0.87,
|
| 3461 |
+
"grad_norm": 1.0002127885818481,
|
| 3462 |
+
"learning_rate": 0.0008135117429266756,
|
| 3463 |
+
"loss": 0.1303,
|
| 3464 |
+
"step": 489
|
| 3465 |
+
},
|
| 3466 |
+
{
|
| 3467 |
+
"epoch": 0.87,
|
| 3468 |
+
"grad_norm": 0.7232715487480164,
|
| 3469 |
+
"learning_rate": 0.0008127849961294984,
|
| 3470 |
+
"loss": 0.143,
|
| 3471 |
+
"step": 490
|
| 3472 |
+
},
|
| 3473 |
+
{
|
| 3474 |
+
"epoch": 0.87,
|
| 3475 |
+
"grad_norm": 0.13510456681251526,
|
| 3476 |
+
"learning_rate": 0.0008120571620415006,
|
| 3477 |
+
"loss": 0.1536,
|
| 3478 |
+
"step": 491
|
| 3479 |
+
},
|
| 3480 |
+
{
|
| 3481 |
+
"epoch": 0.87,
|
| 3482 |
+
"grad_norm": 0.5168789625167847,
|
| 3483 |
+
"learning_rate": 0.0008113282431927503,
|
| 3484 |
+
"loss": 0.1312,
|
| 3485 |
+
"step": 492
|
| 3486 |
+
},
|
| 3487 |
+
{
|
| 3488 |
+
"epoch": 0.87,
|
| 3489 |
+
"grad_norm": 0.7039850950241089,
|
| 3490 |
+
"learning_rate": 0.000810598242117086,
|
| 3491 |
+
"loss": 0.118,
|
| 3492 |
+
"step": 493
|
| 3493 |
+
},
|
| 3494 |
+
{
|
| 3495 |
+
"epoch": 0.87,
|
| 3496 |
+
"grad_norm": 1.5126641988754272,
|
| 3497 |
+
"learning_rate": 0.0008098671613521089,
|
| 3498 |
+
"loss": 0.2343,
|
| 3499 |
+
"step": 494
|
| 3500 |
+
},
|
| 3501 |
+
{
|
| 3502 |
+
"epoch": 0.88,
|
| 3503 |
+
"grad_norm": 0.6958308815956116,
|
| 3504 |
+
"learning_rate": 0.0008091350034391731,
|
| 3505 |
+
"loss": 0.1648,
|
| 3506 |
+
"step": 495
|
| 3507 |
+
},
|
| 3508 |
+
{
|
| 3509 |
+
"epoch": 0.88,
|
| 3510 |
+
"grad_norm": 6.979303359985352,
|
| 3511 |
+
"learning_rate": 0.0008084017709233766,
|
| 3512 |
+
"loss": 0.2261,
|
| 3513 |
+
"step": 496
|
| 3514 |
+
},
|
| 3515 |
+
{
|
| 3516 |
+
"epoch": 0.88,
|
| 3517 |
+
"grad_norm": 0.3389752507209778,
|
| 3518 |
+
"learning_rate": 0.0008076674663535537,
|
| 3519 |
+
"loss": 0.146,
|
| 3520 |
+
"step": 497
|
| 3521 |
+
},
|
| 3522 |
+
{
|
| 3523 |
+
"epoch": 0.88,
|
| 3524 |
+
"grad_norm": 0.19990071654319763,
|
| 3525 |
+
"learning_rate": 0.0008069320922822643,
|
| 3526 |
+
"loss": 0.1429,
|
| 3527 |
+
"step": 498
|
| 3528 |
+
},
|
| 3529 |
+
{
|
| 3530 |
+
"epoch": 0.88,
|
| 3531 |
+
"grad_norm": 0.33689868450164795,
|
| 3532 |
+
"learning_rate": 0.0008061956512657871,
|
| 3533 |
+
"loss": 0.147,
|
| 3534 |
+
"step": 499
|
| 3535 |
+
},
|
| 3536 |
+
{
|
| 3537 |
+
"epoch": 0.88,
|
| 3538 |
+
"grad_norm": 0.09925112873315811,
|
| 3539 |
+
"learning_rate": 0.000805458145864109,
|
| 3540 |
+
"loss": 0.1342,
|
| 3541 |
+
"step": 500
|
| 3542 |
+
},
|
| 3543 |
+
{
|
| 3544 |
+
"epoch": 0.89,
|
| 3545 |
+
"grad_norm": 1.961702585220337,
|
| 3546 |
+
"learning_rate": 0.0008047195786409172,
|
| 3547 |
+
"loss": 0.1361,
|
| 3548 |
+
"step": 501
|
| 3549 |
+
},
|
| 3550 |
+
{
|
| 3551 |
+
"epoch": 0.89,
|
| 3552 |
+
"grad_norm": 0.4342229962348938,
|
| 3553 |
+
"learning_rate": 0.0008039799521635895,
|
| 3554 |
+
"loss": 0.1485,
|
| 3555 |
+
"step": 502
|
| 3556 |
+
},
|
| 3557 |
+
{
|
| 3558 |
+
"epoch": 0.89,
|
| 3559 |
+
"grad_norm": 0.1798858642578125,
|
| 3560 |
+
"learning_rate": 0.0008032392690031867,
|
| 3561 |
+
"loss": 0.1314,
|
| 3562 |
+
"step": 503
|
| 3563 |
+
},
|
| 3564 |
+
{
|
| 3565 |
+
"epoch": 0.89,
|
| 3566 |
+
"grad_norm": 1.3653756380081177,
|
| 3567 |
+
"learning_rate": 0.0008024975317344421,
|
| 3568 |
+
"loss": 0.1388,
|
| 3569 |
+
"step": 504
|
| 3570 |
+
},
|
| 3571 |
+
{
|
| 3572 |
+
"epoch": 0.89,
|
| 3573 |
+
"grad_norm": 9.677605628967285,
|
| 3574 |
+
"learning_rate": 0.0008017547429357531,
|
| 3575 |
+
"loss": 0.4186,
|
| 3576 |
+
"step": 505
|
| 3577 |
+
},
|
| 3578 |
+
{
|
| 3579 |
+
"epoch": 0.9,
|
| 3580 |
+
"grad_norm": 8.348475456237793,
|
| 3581 |
+
"learning_rate": 0.0008010109051891731,
|
| 3582 |
+
"loss": 0.3806,
|
| 3583 |
+
"step": 506
|
| 3584 |
+
},
|
| 3585 |
+
{
|
| 3586 |
+
"epoch": 0.9,
|
| 3587 |
+
"grad_norm": 35.19770050048828,
|
| 3588 |
+
"learning_rate": 0.0008002660210804011,
|
| 3589 |
+
"loss": 3.6145,
|
| 3590 |
+
"step": 507
|
| 3591 |
+
},
|
| 3592 |
+
{
|
| 3593 |
+
"epoch": 0.9,
|
| 3594 |
+
"grad_norm": 9.18663501739502,
|
| 3595 |
+
"learning_rate": 0.0007995200931987743,
|
| 3596 |
+
"loss": 0.6162,
|
| 3597 |
+
"step": 508
|
| 3598 |
+
},
|
| 3599 |
+
{
|
| 3600 |
+
"epoch": 0.9,
|
| 3601 |
+
"grad_norm": 0.05997322499752045,
|
| 3602 |
+
"learning_rate": 0.0007987731241372571,
|
| 3603 |
+
"loss": 0.1129,
|
| 3604 |
+
"step": 509
|
| 3605 |
+
},
|
| 3606 |
+
{
|
| 3607 |
+
"epoch": 0.9,
|
| 3608 |
+
"grad_norm": 0.41408172249794006,
|
| 3609 |
+
"learning_rate": 0.000798025116492434,
|
| 3610 |
+
"loss": 0.1512,
|
| 3611 |
+
"step": 510
|
| 3612 |
+
},
|
| 3613 |
+
{
|
| 3614 |
+
"epoch": 0.9,
|
| 3615 |
+
"grad_norm": 0.4445393979549408,
|
| 3616 |
+
"learning_rate": 0.0007972760728644996,
|
| 3617 |
+
"loss": 0.1463,
|
| 3618 |
+
"step": 511
|
| 3619 |
+
},
|
| 3620 |
+
{
|
| 3621 |
+
"epoch": 0.91,
|
| 3622 |
+
"grad_norm": 0.19678063690662384,
|
| 3623 |
+
"learning_rate": 0.0007965259958572495,
|
| 3624 |
+
"loss": 0.1386,
|
| 3625 |
+
"step": 512
|
| 3626 |
+
},
|
| 3627 |
+
{
|
| 3628 |
+
"epoch": 0.91,
|
| 3629 |
+
"grad_norm": 0.45497119426727295,
|
| 3630 |
+
"learning_rate": 0.0007957748880780721,
|
| 3631 |
+
"loss": 0.1373,
|
| 3632 |
+
"step": 513
|
| 3633 |
+
},
|
| 3634 |
+
{
|
| 3635 |
+
"epoch": 0.91,
|
| 3636 |
+
"grad_norm": 0.6455509066581726,
|
| 3637 |
+
"learning_rate": 0.0007950227521379381,
|
| 3638 |
+
"loss": 0.1584,
|
| 3639 |
+
"step": 514
|
| 3640 |
+
},
|
| 3641 |
+
{
|
| 3642 |
+
"epoch": 0.91,
|
| 3643 |
+
"grad_norm": 0.3793765604496002,
|
| 3644 |
+
"learning_rate": 0.0007942695906513929,
|
| 3645 |
+
"loss": 0.1236,
|
| 3646 |
+
"step": 515
|
| 3647 |
+
},
|
| 3648 |
+
{
|
| 3649 |
+
"epoch": 0.91,
|
| 3650 |
+
"grad_norm": 0.20562775433063507,
|
| 3651 |
+
"learning_rate": 0.0007935154062365467,
|
| 3652 |
+
"loss": 0.1364,
|
| 3653 |
+
"step": 516
|
| 3654 |
+
},
|
| 3655 |
+
{
|
| 3656 |
+
"epoch": 0.91,
|
| 3657 |
+
"grad_norm": 1.3131325244903564,
|
| 3658 |
+
"learning_rate": 0.0007927602015150655,
|
| 3659 |
+
"loss": 0.1556,
|
| 3660 |
+
"step": 517
|
| 3661 |
+
},
|
| 3662 |
+
{
|
| 3663 |
+
"epoch": 0.92,
|
| 3664 |
+
"grad_norm": 0.1705670803785324,
|
| 3665 |
+
"learning_rate": 0.0007920039791121617,
|
| 3666 |
+
"loss": 0.1372,
|
| 3667 |
+
"step": 518
|
| 3668 |
+
},
|
| 3669 |
+
{
|
| 3670 |
+
"epoch": 0.92,
|
| 3671 |
+
"grad_norm": 6.6207499504089355,
|
| 3672 |
+
"learning_rate": 0.0007912467416565861,
|
| 3673 |
+
"loss": 0.22,
|
| 3674 |
+
"step": 519
|
| 3675 |
+
},
|
| 3676 |
+
{
|
| 3677 |
+
"epoch": 0.92,
|
| 3678 |
+
"grad_norm": 0.34343230724334717,
|
| 3679 |
+
"learning_rate": 0.0007904884917806173,
|
| 3680 |
+
"loss": 0.1453,
|
| 3681 |
+
"step": 520
|
| 3682 |
+
},
|
| 3683 |
+
{
|
| 3684 |
+
"epoch": 0.92,
|
| 3685 |
+
"grad_norm": 0.4290754497051239,
|
| 3686 |
+
"learning_rate": 0.0007897292321200537,
|
| 3687 |
+
"loss": 0.1177,
|
| 3688 |
+
"step": 521
|
| 3689 |
+
},
|
| 3690 |
+
{
|
| 3691 |
+
"epoch": 0.92,
|
| 3692 |
+
"grad_norm": 0.24469922482967377,
|
| 3693 |
+
"learning_rate": 0.0007889689653142036,
|
| 3694 |
+
"loss": 0.1369,
|
| 3695 |
+
"step": 522
|
| 3696 |
+
},
|
| 3697 |
+
{
|
| 3698 |
+
"epoch": 0.93,
|
| 3699 |
+
"grad_norm": 0.5307168960571289,
|
| 3700 |
+
"learning_rate": 0.0007882076940058763,
|
| 3701 |
+
"loss": 0.1542,
|
| 3702 |
+
"step": 523
|
| 3703 |
+
},
|
| 3704 |
+
{
|
| 3705 |
+
"epoch": 0.93,
|
| 3706 |
+
"grad_norm": 0.13802866637706757,
|
| 3707 |
+
"learning_rate": 0.000787445420841373,
|
| 3708 |
+
"loss": 0.1372,
|
| 3709 |
+
"step": 524
|
| 3710 |
+
},
|
| 3711 |
+
{
|
| 3712 |
+
"epoch": 0.93,
|
| 3713 |
+
"grad_norm": 0.36055922508239746,
|
| 3714 |
+
"learning_rate": 0.0007866821484704776,
|
| 3715 |
+
"loss": 0.1413,
|
| 3716 |
+
"step": 525
|
| 3717 |
+
},
|
| 3718 |
+
{
|
| 3719 |
+
"epoch": 0.93,
|
| 3720 |
+
"grad_norm": 0.36655113101005554,
|
| 3721 |
+
"learning_rate": 0.0007859178795464472,
|
| 3722 |
+
"loss": 0.1438,
|
| 3723 |
+
"step": 526
|
| 3724 |
+
},
|
| 3725 |
+
{
|
| 3726 |
+
"epoch": 0.93,
|
| 3727 |
+
"grad_norm": 0.6237390637397766,
|
| 3728 |
+
"learning_rate": 0.0007851526167260034,
|
| 3729 |
+
"loss": 0.1382,
|
| 3730 |
+
"step": 527
|
| 3731 |
+
},
|
| 3732 |
+
{
|
| 3733 |
+
"epoch": 0.93,
|
| 3734 |
+
"grad_norm": 0.42217007279396057,
|
| 3735 |
+
"learning_rate": 0.0007843863626693221,
|
| 3736 |
+
"loss": 0.1408,
|
| 3737 |
+
"step": 528
|
| 3738 |
+
},
|
| 3739 |
+
{
|
| 3740 |
+
"epoch": 0.94,
|
| 3741 |
+
"grad_norm": 24.023250579833984,
|
| 3742 |
+
"learning_rate": 0.0007836191200400256,
|
| 3743 |
+
"loss": 0.1517,
|
| 3744 |
+
"step": 529
|
| 3745 |
+
},
|
| 3746 |
+
{
|
| 3747 |
+
"epoch": 0.94,
|
| 3748 |
+
"grad_norm": 0.31599146127700806,
|
| 3749 |
+
"learning_rate": 0.0007828508915051723,
|
| 3750 |
+
"loss": 0.1353,
|
| 3751 |
+
"step": 530
|
| 3752 |
+
},
|
| 3753 |
+
{
|
| 3754 |
+
"epoch": 0.94,
|
| 3755 |
+
"grad_norm": 0.6795622706413269,
|
| 3756 |
+
"learning_rate": 0.0007820816797352479,
|
| 3757 |
+
"loss": 0.1515,
|
| 3758 |
+
"step": 531
|
| 3759 |
+
},
|
| 3760 |
+
{
|
| 3761 |
+
"epoch": 0.94,
|
| 3762 |
+
"grad_norm": 0.37493640184402466,
|
| 3763 |
+
"learning_rate": 0.0007813114874041557,
|
| 3764 |
+
"loss": 0.141,
|
| 3765 |
+
"step": 532
|
| 3766 |
+
},
|
| 3767 |
+
{
|
| 3768 |
+
"epoch": 0.94,
|
| 3769 |
+
"grad_norm": 1.7365546226501465,
|
| 3770 |
+
"learning_rate": 0.0007805403171892079,
|
| 3771 |
+
"loss": 0.1347,
|
| 3772 |
+
"step": 533
|
| 3773 |
+
},
|
| 3774 |
+
{
|
| 3775 |
+
"epoch": 0.94,
|
| 3776 |
+
"grad_norm": 18.393390655517578,
|
| 3777 |
+
"learning_rate": 0.000779768171771116,
|
| 3778 |
+
"loss": 0.1753,
|
| 3779 |
+
"step": 534
|
| 3780 |
+
},
|
| 3781 |
+
{
|
| 3782 |
+
"epoch": 0.95,
|
| 3783 |
+
"grad_norm": 2.2978413105010986,
|
| 3784 |
+
"learning_rate": 0.0007789950538339812,
|
| 3785 |
+
"loss": 0.1418,
|
| 3786 |
+
"step": 535
|
| 3787 |
+
},
|
| 3788 |
+
{
|
| 3789 |
+
"epoch": 0.95,
|
| 3790 |
+
"grad_norm": 0.495151162147522,
|
| 3791 |
+
"learning_rate": 0.0007782209660652854,
|
| 3792 |
+
"loss": 0.146,
|
| 3793 |
+
"step": 536
|
| 3794 |
+
},
|
| 3795 |
+
{
|
| 3796 |
+
"epoch": 0.95,
|
| 3797 |
+
"grad_norm": 7.705572605133057,
|
| 3798 |
+
"learning_rate": 0.0007774459111558821,
|
| 3799 |
+
"loss": 0.2042,
|
| 3800 |
+
"step": 537
|
| 3801 |
+
},
|
| 3802 |
+
{
|
| 3803 |
+
"epoch": 0.95,
|
| 3804 |
+
"grad_norm": 0.6036086678504944,
|
| 3805 |
+
"learning_rate": 0.0007766698917999862,
|
| 3806 |
+
"loss": 0.1695,
|
| 3807 |
+
"step": 538
|
| 3808 |
+
},
|
| 3809 |
+
{
|
| 3810 |
+
"epoch": 0.95,
|
| 3811 |
+
"grad_norm": 127.21215057373047,
|
| 3812 |
+
"learning_rate": 0.0007758929106951656,
|
| 3813 |
+
"loss": 18.5136,
|
| 3814 |
+
"step": 539
|
| 3815 |
+
},
|
| 3816 |
+
{
|
| 3817 |
+
"epoch": 0.96,
|
| 3818 |
+
"grad_norm": 40.58448791503906,
|
| 3819 |
+
"learning_rate": 0.0007751149705423312,
|
| 3820 |
+
"loss": 0.5973,
|
| 3821 |
+
"step": 540
|
| 3822 |
+
},
|
| 3823 |
+
{
|
| 3824 |
+
"epoch": 0.96,
|
| 3825 |
+
"grad_norm": 0.6296218633651733,
|
| 3826 |
+
"learning_rate": 0.0007743360740457278,
|
| 3827 |
+
"loss": 0.1849,
|
| 3828 |
+
"step": 541
|
| 3829 |
+
},
|
| 3830 |
+
{
|
| 3831 |
+
"epoch": 0.96,
|
| 3832 |
+
"grad_norm": 0.4533160924911499,
|
| 3833 |
+
"learning_rate": 0.0007735562239129247,
|
| 3834 |
+
"loss": 0.1464,
|
| 3835 |
+
"step": 542
|
| 3836 |
+
},
|
| 3837 |
+
{
|
| 3838 |
+
"epoch": 0.96,
|
| 3839 |
+
"grad_norm": 0.2379036247730255,
|
| 3840 |
+
"learning_rate": 0.0007727754228548058,
|
| 3841 |
+
"loss": 0.1267,
|
| 3842 |
+
"step": 543
|
| 3843 |
+
},
|
| 3844 |
+
{
|
| 3845 |
+
"epoch": 0.96,
|
| 3846 |
+
"grad_norm": 0.8904889225959778,
|
| 3847 |
+
"learning_rate": 0.000771993673585561,
|
| 3848 |
+
"loss": 0.2181,
|
| 3849 |
+
"step": 544
|
| 3850 |
+
},
|
| 3851 |
+
{
|
| 3852 |
+
"epoch": 0.96,
|
| 3853 |
+
"grad_norm": 0.8934443593025208,
|
| 3854 |
+
"learning_rate": 0.0007712109788226762,
|
| 3855 |
+
"loss": 0.2158,
|
| 3856 |
+
"step": 545
|
| 3857 |
+
},
|
| 3858 |
+
{
|
| 3859 |
+
"epoch": 0.97,
|
| 3860 |
+
"grad_norm": 0.3368353545665741,
|
| 3861 |
+
"learning_rate": 0.0007704273412869238,
|
| 3862 |
+
"loss": 0.1489,
|
| 3863 |
+
"step": 546
|
| 3864 |
+
},
|
| 3865 |
+
{
|
| 3866 |
+
"epoch": 0.97,
|
| 3867 |
+
"grad_norm": 0.2570180594921112,
|
| 3868 |
+
"learning_rate": 0.0007696427637023537,
|
| 3869 |
+
"loss": 0.144,
|
| 3870 |
+
"step": 547
|
| 3871 |
+
},
|
| 3872 |
+
{
|
| 3873 |
+
"epoch": 0.97,
|
| 3874 |
+
"grad_norm": 2.865034580230713,
|
| 3875 |
+
"learning_rate": 0.0007688572487962834,
|
| 3876 |
+
"loss": 0.1664,
|
| 3877 |
+
"step": 548
|
| 3878 |
+
},
|
| 3879 |
+
{
|
| 3880 |
+
"epoch": 0.97,
|
| 3881 |
+
"grad_norm": 0.4369525611400604,
|
| 3882 |
+
"learning_rate": 0.0007680707992992888,
|
| 3883 |
+
"loss": 0.1777,
|
| 3884 |
+
"step": 549
|
| 3885 |
+
},
|
| 3886 |
+
{
|
| 3887 |
+
"epoch": 0.97,
|
| 3888 |
+
"grad_norm": 0.2545509934425354,
|
| 3889 |
+
"learning_rate": 0.0007672834179451942,
|
| 3890 |
+
"loss": 0.1536,
|
| 3891 |
+
"step": 550
|
| 3892 |
+
},
|
| 3893 |
+
{
|
| 3894 |
+
"epoch": 0.97,
|
| 3895 |
+
"grad_norm": 0.14455465972423553,
|
| 3896 |
+
"learning_rate": 0.0007664951074710638,
|
| 3897 |
+
"loss": 0.1256,
|
| 3898 |
+
"step": 551
|
| 3899 |
+
},
|
| 3900 |
+
{
|
| 3901 |
+
"epoch": 0.98,
|
| 3902 |
+
"grad_norm": 0.16001886129379272,
|
| 3903 |
+
"learning_rate": 0.0007657058706171911,
|
| 3904 |
+
"loss": 0.1356,
|
| 3905 |
+
"step": 552
|
| 3906 |
+
},
|
| 3907 |
+
{
|
| 3908 |
+
"epoch": 0.98,
|
| 3909 |
+
"grad_norm": 0.2537885308265686,
|
| 3910 |
+
"learning_rate": 0.0007649157101270903,
|
| 3911 |
+
"loss": 0.1393,
|
| 3912 |
+
"step": 553
|
| 3913 |
+
},
|
| 3914 |
+
{
|
| 3915 |
+
"epoch": 0.98,
|
| 3916 |
+
"grad_norm": 0.33060047030448914,
|
| 3917 |
+
"learning_rate": 0.0007641246287474854,
|
| 3918 |
+
"loss": 0.148,
|
| 3919 |
+
"step": 554
|
| 3920 |
+
},
|
| 3921 |
+
{
|
| 3922 |
+
"epoch": 0.98,
|
| 3923 |
+
"grad_norm": 1.691941499710083,
|
| 3924 |
+
"learning_rate": 0.0007633326292283028,
|
| 3925 |
+
"loss": 0.1764,
|
| 3926 |
+
"step": 555
|
| 3927 |
+
},
|
| 3928 |
+
{
|
| 3929 |
+
"epoch": 0.98,
|
| 3930 |
+
"grad_norm": 0.20472805202007294,
|
| 3931 |
+
"learning_rate": 0.0007625397143226595,
|
| 3932 |
+
"loss": 0.1424,
|
| 3933 |
+
"step": 556
|
| 3934 |
+
},
|
| 3935 |
+
{
|
| 3936 |
+
"epoch": 0.99,
|
| 3937 |
+
"grad_norm": 0.7124485969543457,
|
| 3938 |
+
"learning_rate": 0.0007617458867868553,
|
| 3939 |
+
"loss": 0.1482,
|
| 3940 |
+
"step": 557
|
| 3941 |
+
},
|
| 3942 |
+
{
|
| 3943 |
+
"epoch": 0.99,
|
| 3944 |
+
"grad_norm": 0.09631184488534927,
|
| 3945 |
+
"learning_rate": 0.0007609511493803615,
|
| 3946 |
+
"loss": 0.1392,
|
| 3947 |
+
"step": 558
|
| 3948 |
+
},
|
| 3949 |
+
{
|
| 3950 |
+
"epoch": 0.99,
|
| 3951 |
+
"grad_norm": 0.20814809203147888,
|
| 3952 |
+
"learning_rate": 0.0007601555048658133,
|
| 3953 |
+
"loss": 0.1384,
|
| 3954 |
+
"step": 559
|
| 3955 |
+
},
|
| 3956 |
+
{
|
| 3957 |
+
"epoch": 0.99,
|
| 3958 |
+
"grad_norm": 0.19566737115383148,
|
| 3959 |
+
"learning_rate": 0.0007593589560089984,
|
| 3960 |
+
"loss": 0.1394,
|
| 3961 |
+
"step": 560
|
| 3962 |
+
},
|
| 3963 |
+
{
|
| 3964 |
+
"epoch": 0.99,
|
| 3965 |
+
"grad_norm": 0.13406091928482056,
|
| 3966 |
+
"learning_rate": 0.0007585615055788484,
|
| 3967 |
+
"loss": 0.1389,
|
| 3968 |
+
"step": 561
|
| 3969 |
+
},
|
| 3970 |
+
{
|
| 3971 |
+
"epoch": 0.99,
|
| 3972 |
+
"grad_norm": 0.07635807991027832,
|
| 3973 |
+
"learning_rate": 0.0007577631563474291,
|
| 3974 |
+
"loss": 0.1376,
|
| 3975 |
+
"step": 562
|
| 3976 |
+
},
|
| 3977 |
+
{
|
| 3978 |
+
"epoch": 1.0,
|
| 3979 |
+
"grad_norm": 0.11265091598033905,
|
| 3980 |
+
"learning_rate": 0.0007569639110899302,
|
| 3981 |
+
"loss": 0.1395,
|
| 3982 |
+
"step": 563
|
| 3983 |
+
},
|
| 3984 |
+
{
|
| 3985 |
+
"epoch": 1.0,
|
| 3986 |
+
"grad_norm": 0.31152746081352234,
|
| 3987 |
+
"learning_rate": 0.0007561637725846567,
|
| 3988 |
+
"loss": 0.1407,
|
| 3989 |
+
"step": 564
|
| 3990 |
+
},
|
| 3991 |
+
{
|
| 3992 |
+
"epoch": 1.0,
|
| 3993 |
+
"grad_norm": 0.13474373519420624,
|
| 3994 |
+
"learning_rate": 0.0007553627436130183,
|
| 3995 |
+
"loss": 0.1386,
|
| 3996 |
+
"step": 565
|
| 3997 |
+
}
|
| 3998 |
+
],
|
| 3999 |
+
"logging_steps": 1,
|
| 4000 |
+
"max_steps": 1695,
|
| 4001 |
+
"num_input_tokens_seen": 0,
|
| 4002 |
+
"num_train_epochs": 3,
|
| 4003 |
+
"save_steps": 565,
|
| 4004 |
+
"total_flos": 5.169945694856806e+16,
|
| 4005 |
+
"train_batch_size": 2,
|
| 4006 |
+
"trial_name": null,
|
| 4007 |
+
"trial_params": null
|
| 4008 |
+
}
|
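The per-step records above live in the `log_history` array of `trainer_state.json`, so the training curve can be inspected programmatically. A minimal sketch in Python; the file path assumes a local clone of this repository:

```python
import json

# Read the trainer state saved alongside this checkpoint
# (path is an assumption -- point it at your local clone).
with open("checkpoint-565/trainer_state.json") as f:
    state = json.load(f)

# Training-step records carry "loss"; evaluation records carry "eval_loss".
train_logs = [r for r in state["log_history"] if "loss" in r]
eval_logs = [r for r in state["log_history"] if "eval_loss" in r]

last = train_logs[-1]
print(f"{len(train_logs)} training steps logged, {len(eval_logs)} evals")
print(f"step {last['step']}: loss={last['loss']}, grad_norm={last['grad_norm']}")
if eval_logs:
    print(f"latest eval_loss: {eval_logs[-1]['eval_loss']}")
```

The same split between `loss` and `eval_loss` records also makes it easy to plot the two curves separately, which is useful for spotting the occasional loss spikes visible in the log (e.g. around step 539).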
checkpoint-565/training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:242a9177bc415e2f72dc78b8f2eb1cd29a0e78da733db42237e6bb4cd1af9c7d
size 5752
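The three lines above are a Git LFS pointer rather than the binary itself; the real file is fetched by LFS and can be checked against the recorded digest and size. A quick verification sketch, assuming LFS has already materialized the file in a local clone:

```python
import hashlib
import os

path = "checkpoint-565/training_args.bin"  # assumes a local clone with LFS files pulled

# Compare the local file against the oid/size recorded in the pointer.
digest = hashlib.sha256(open(path, "rb").read()).hexdigest()
print(digest == "242a9177bc415e2f72dc78b8f2eb1cd29a0e78da733db42237e6bb4cd1af9c7d")
print(os.path.getsize(path) == 5752)
```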
checkpoint-565/vocab.json
ADDED
The diff for this file is too large to render.
config.json
ADDED
@@ -0,0 +1,42 @@
{
  "_name_or_path": "Qwen/Qwen1.5-7B",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "quantization_config": {
    "_load_in_4bit": false,
    "_load_in_8bit": true,
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "fp4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": false,
    "load_in_8bit": true,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.40.0.dev0",
  "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}
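Per the `quantization_config` above, the base model was loaded in 8-bit via bitsandbytes during training. A minimal sketch for loading the base model the same way and attaching one of the saved adapters; the adapter path and the CUDA-GPU requirement are assumptions:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Mirror the quantization_config above: 8-bit bitsandbytes loading.
bnb_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)

base = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen1.5-7B",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16"
    device_map="auto",           # bitsandbytes 8-bit loading needs a CUDA GPU
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B")

# Attach the LoRA adapter from one of the checkpoint directories in this
# repo (path is an assumption -- point it at your local clone).
model = PeftModel.from_pretrained(base, "checkpoint-565")
model.eval()
```

Note that `use_cache` is disabled in the config above for training; for generation it can be re-enabled by passing `use_cache=True` at inference time.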