ArtusDev committed (verified) · Commit d161a9c · Parent: cf831d8

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,227 @@
---
base_model:
- mistralai/Mistral-Nemo-Base-2407
license: apache-2.0
tags:
- writing
- creative-writing
---

# Koto 22B (Pretrained)

![image/png](https://cdn-uploads.huggingface.co/production/uploads/634262af8d8089ebaefd410e/cnBQlWjMTKGLOKMudPBVj.png)

Koto-22B-PT is a [depth-upscaled](https://arxiv.org/abs/2312.15166) version of Mistral-Nemo-Base-2407, healed and trained on almost a billion tokens of creative writing data.

## Usage

This model is not intended for use outside of raw text completion settings, such as co-writing. Instruct formats will *not* work. Multi-turn roleplay will *not* work.

It was trained at a 32k sequence length, but since not all samples were that long, we expect ~16k of effective context in the best case.

We found that a temperature of 1.5-1.55 and a min_p of 0.05-0.1 worked best, but YMMV!

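As a rough sketch of the settings above, raw completion with those samplers might look like this in 🤗 Transformers. The model path is a placeholder for wherever you keep the weights, and `min_p` needs a reasonably recent Transformers release.

```python
# Minimal completion-mode sketch (assumptions: a local path to the weights,
# a Transformers version with min_p sampling, and enough VRAM for bf16).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "path/to/Koto-22B-PT"  # placeholder, not a canonical repo id
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.bfloat16, device_map="auto"
)

# Plain prose prompt: no chat template, since this is a completion-only model.
prompt = "The lighthouse keeper had not spoken to another soul in three winters."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

output = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,
    temperature=1.5,  # card suggests 1.5-1.55
    min_p=0.05,       # card suggests 0.05-0.1
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
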
## Datasets

Some of the data used to train this model includes:

- Most of [The Anarchist Library](https://theanarchistlibrary.org/), a repository for anarchist manifestos and writing (see [allura-org/the-anarchist-library](https://huggingface.co/datasets/allura-org/the-anarchist-library))
- A random sample of public domain books from Project Gutenberg
- Furry (anthro and feral) storytelling and smut
- A small subset of known high-quality books and story data

## Acknowledgements

- thank you to [@takeshimaxfj](https://x.com/takeshimaxfj) on Twitter for drawing the art used in the model card!
- thank you very much to [mango/deltavector](https://huggingface.co/Delta-Vector) for providing the compute used to train this model
- thanks to curse for testing and ideas
- thanks to toasty for some of the data and ideas
- thanks to everyone else in allura for moral support

ilya <3

## Technical Appendix

<details>

### Training Notes

This model was trained over the course of ~14 hours on an 8xB200 node. We used 8-bit AdamW and the REX LR scheduler, as well as both gradient clipping and weight decay for regularization.

There *was* a very odd loss spike ~60% of the way through training, but it recovered and the model seems fine? So? Eh? If it works, it works :3

### WandB

![image/png](https://cdn-uploads.huggingface.co/production/uploads/634262af8d8089ebaefd410e/6XFFhkQD8lUFGerBrOAyd.png)

### Finetuning Notes

ChatML tokens have already been added to this model, in case you prefer to tune with that chat format. Please do not re-add them, so that the vocab size stays unchanged for (possible) usage on hosts like Featherless.

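For reference, ChatML wraps each turn in the `<|im_start|>` and `<|im_end|>` tokens mentioned above. A small sketch of how a downstream finetune's prompts would render:

```python
# Reference only: how ChatML-formatted text looks. The base Koto-22B-PT model
# is completion-only; this matters only if you finetune with the ChatML format.
def render_chatml(messages):
    """Render a list of {'role': ..., 'content': ...} dicts as ChatML text."""
    return "".join(
        f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in messages
    )

print(render_chatml([
    {"role": "system", "content": "You are a co-writing assistant."},
    {"role": "user", "content": "Continue the scene at the lighthouse."},
]))
```
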
### Axolotl Config
```yaml
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true

## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false

## data
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text

shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain

## Liger + CCE
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true

## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

## max grad norm
max_grad_norm: 1.0

## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:

## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:

warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
```

### Mergekit Config
```yaml
dtype: bfloat16
merge_method: passthrough

slices:
  # untouched intro
  - sources:
      - layer_range: [0, 8]
        model: mistralai/Mistral-Nemo-Base-2407

  - sources:
      - layer_range: [8, 12]
        model: mistralai/Mistral-Nemo-Base-2407
  # 8–16 baseline
  - sources:
      - layer_range: [8, 16]
        model: mistralai/Mistral-Nemo-Base-2407
  # 8–16 duplicate with projections nulled
  - sources:
      - layer_range: [8, 16]
        model: mistralai/Mistral-Nemo-Base-2407
        parameters:
          scale:
            - filter: o_proj
              value: 0.0
            - filter: down_proj
              value: 0.0
            - value: 1.0

  # 16–24 duplicate
  - sources:
      - layer_range: [16, 24]
        model: mistralai/Mistral-Nemo-Base-2407
        parameters:
          scale:
            - filter: o_proj
              value: 0.0
            - filter: down_proj
              value: 0.0
            - value: 1.0
  # 16–24 baseline
  - sources:
      - layer_range: [16, 24]
        model: mistralai/Mistral-Nemo-Base-2407
  # 16–24 duplicate
  - sources:
      - layer_range: [16, 24]
        model: mistralai/Mistral-Nemo-Base-2407
        parameters:
          scale:
            - filter: o_proj
              value: 0.0
            - filter: down_proj
              value: 0.0
            - value: 1.0

  # 24–32 baseline
  - sources:
      - layer_range: [24, 32]
        model: mistralai/Mistral-Nemo-Base-2407
  # 24–32 duplicate
  - sources:
      - layer_range: [24, 32]
        model: mistralai/Mistral-Nemo-Base-2407
        parameters:
          scale:
            - filter: o_proj
              value: 0.0
            - filter: down_proj
              value: 0.0
            - value: 1.0

  # untouched tail
  - sources:
      - layer_range: [32, 40]
        model: mistralai/Mistral-Nemo-Base-2407
```
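As a sanity check, these slices stack 8 + 4 + (8 × 8) = 76 layers, up from Mistral-Nemo's 40, which matches the `num_hidden_layers: 76` reported in config.json below.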

</details>
config.json ADDED
@@ -0,0 +1,37 @@
{
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 76,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.54.1",
  "use_cache": false,
  "vocab_size": 131074,
  "quantization_config": {
    "quant_method": "exl3",
    "version": "0.0.5",
    "bits": 4.5,
    "head_bits": 6,
    "calibration": {
      "rows": 100,
      "cols": 2048
    },
    "out_scales": "auto"
  }
}
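As a rough sanity check on the "22B" in the model name, the shapes above are enough for a back-of-the-envelope parameter count (assuming standard Mistral-style GQA attention and SwiGLU MLP weights, and ignoring norm parameters):

```python
# Back-of-the-envelope parameter count from the config values above.
hidden, layers, inter = 5120, 76, 14336
heads, kv_heads, head_dim, vocab = 32, 8, 128, 131074

attn = hidden * heads * head_dim          # q_proj
attn += 2 * hidden * kv_heads * head_dim  # k_proj + v_proj
attn += heads * head_dim * hidden         # o_proj
mlp = 3 * hidden * inter                  # gate_proj + up_proj + down_proj
embeddings = 2 * vocab * hidden           # untied input + output embeddings

total = layers * (attn + mlp) + embeddings
print(f"~{total / 1e9:.1f}B parameters")  # ~22.1B
```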
generation_config.json ADDED
@@ -0,0 +1,8 @@
{
  "_from_model_config": true,
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "transformers_version": "4.54.1",
  "use_cache": false
}
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:399b609ebfcc6511f2717ac34cfbb57cd32e6641096e76531a293e5b947d37d6
size 8559303976
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2ea78f97c4d8016ff26b7102912bf4dd386d490aa46a9cceeebcf5ec3727cde4
size 4957196440
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
quantization_config.json ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:48130f8c042761b84abbfbf10ad07efa7c26108a14e7a2a0402daa06e447a47a
size 17078668
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff