Built with Axolotl

Axolotl config (axolotl version 0.13.0.dev0):

# === Model Configuration ===
base_model: NewEden/Apertus-8B-2509-patched-chatML
load_in_8bit: false
load_in_4bit: false

# === HF Configuration === 
#hub_model_id: ToastyPigeon/muse-marvin-32k-lora-2
#hub_strategy: "every_save"
output_dir: apertus-v2/embedding-trained-2ep

# === Wandb Tracking ===
wandb_project: ApertusV2
# wandb_entity: [WANDB_ENTITY]
wandb_name: embeddings-2ep

# === Training Setup ===
num_epochs: 2
micro_batch_size: 1
gradient_accumulation_steps: 4
sequence_len: 4096
#sequence_parallel_degree: 2
#heads_k_stride: 1
sample_packing: true
#pad_to_sequence_len: true
#temperature: 0.7
#max_steps: 10
# === Evaluation ===
val_set_size: 0.025
evals_per_epoch: 10
#eval_steps: 20
#max_steps: 60
#eval_table_size:
eval_max_new_tokens: 128
#eval_sample_packing: true
#eval_strategy: "no"

# === LoRA Configuration ===
adapter:
lora_model_dir:
lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_target_modules:
#  - up_proj
#  - down_proj
#  - gate_proj
#  - q_proj
#  - v_proj
#  - k_proj
#  - o_proj
#  - input_layernorm
#  - post_attention_layernorm
#  - embed_tokens
#  - lm_head

lora_fan_in_fan_out:
#peft_use_rslora: true
lora_modules_to_save:
#  - embed_tokens
#  - lm_head
#fix_untrained_tokens: true
#lora_mlp_kernel: true
#lora_qkv_kernel: true
#lora_o_kernel: true
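# With no LoRA adapter configured above, only the parameters listed below
# (token embeddings and LM head) stay unfrozen and are trained at full rank.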
unfrozen_parameters:
  - embed_tokens
  - lm_head
# === Hyperparameter Configuration ===
#optimizer: apollo_adamw_layerwise
#warmup_steps: 0
warmup_ratio: 0.025
optimizer: adamw_torch_fused
#optimizer: paged_adamw_8bit
#optim_args:
#  enable_stochastic_rounding: true
#  enable_cautious: true
#  enable_8bit: true
# Apollo-mini configuration:
#optim_args: "proj=random,rank=128,scale=128.0,scale_type=tensor,update_proj_gap=100"
# Regular Apollo configuration:
# optim_args: 
#optim_target_modules: all_linear
learning_rate: 5e-5
lr_scheduler: cosine
#cosine_min_lr_ratio: 0.2
#lr_scheduler: cosine_with_min_lr
#lr_scheduler_kwargs:
#  cosine_min_lr: 1e-6
weight_decay: 0.01
max_grad_norm: 1.0
#warmup_steps: 0
#warmup_ratio: 0.025


# === Data Configuration ===
#
#chat_template: jinja
chat_template: chatml
special_tokens:
#  eos_token: "<|im_end|>"
#  eos_token: "</s>"
#tokenizer_use_mistral_common: true
shuffle_merged_datasets: true
datasets:
#  - path: grimulkan/LimaRP-augmented
#    type: chat_template
#    field_messages: conversations
#    message_property_mappings:
#      role: from
#      content: value
#  - path: allenai/tulu-3-sft-personas-instruction-following
#    type: chat_template
#    split: train[:10%]
#  - path: ToastyPigeon/mixed-medical-reasoning-formatted
#    type: chat_template
#    data_files: mixed-medical-thinking.json
#    split: train[:10%]
#  - path: ToastyPigeon/steve-and-marvin
#    type: completion
#    data_files: marvin.json
#  - path: ToastyPigeon/kimi-stories-completion
#    type: completion
#  - path: ToastyPigeon/new-story-dataset
#    type: customcompletion-regex
#    type: completion
#    data_files: new-story-dataset-v2.json
#  - path: allura-org/fujin-instruct-v2
#    type: customchatml-regex
#    type: chat_template
#    field_messages: conversations
#    message_property_mappings:
#      role: from
#      content: value
#  - path: ToastyPigeon/some-rp-extended
#    type: customchatml-regex
#    type: chat_template
#    field_messages: conversations
#    message_property_mappings:
#      role: from
#      content: value
#    roles_to_train: ["user","assistant"]
  - path: allura-forge/koto-instruct-sft-nothink
#    type: customchatml-regex
    type: chat_template
#    split: train[:50%]
#    field_messages: conversations
#    message_property_mappings:
#      role: from
#      content: value
#  - path: ToastyPigeon/SpringDragon
#    type: customcompletion-regex
#    type: completion
#    split: train
#  - path: ToastyPigeon/some-erotica
#    type: customcompletion-regex
#    type: completion
#    split: train[:10%]
#  - path: ToastyPigeon/tulu-mini
#    type: chat_template
dataset_prepared_path: last_run_prepared


# === Plugins ===
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

# === Hardware Optimization ===
#gradient_checkpointing: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
#liger_fused_linear_cross_entropy: true
cut_cross_entropy: true

#deepspeed: ../axolotl/deepspeed_configs/zero2.json

# === FSDP Config === 
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_activation_checkpointing: true
  fsdp_use_orig_params: true
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: ApertusDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD

# === Checkpointing ===
#save_steps: 10
saves_per_epoch: 1
save_total_limit: 1

# === Advanced Settings ===
bf16: auto
flash_attention: true
train_on_inputs: false
group_by_length: false
save_safetensors: true
logging_steps: 1
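
With the config saved to a file, training is typically launched through the Axolotl CLI (for example `axolotl train config.yaml` on the listed 0.13.0.dev0 build). For reference, the `unfrozen_parameters` block freezes every weight except the token embeddings and the LM head; below is a minimal sketch of the equivalent logic in plain Transformers, assuming a build that supports the Apertus architecture (the card lists Transformers 4.56.1):

```python
# Illustrative sketch only: reproduce the effect of
# `unfrozen_parameters: [embed_tokens, lm_head]` by freezing every other weight.
# Axolotl applies this internally via parameter-name matching.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "NewEden/Apertus-8B-2509-patched-chatML",  # base_model from the config above
    torch_dtype=torch.bfloat16,
)

for name, param in model.named_parameters():
    # Keep gradients only for parameters whose names match the unfrozen list.
    param.requires_grad = ("embed_tokens" in name) or ("lm_head" in name)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable: {trainable:,} / {total:,} parameters")
```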

apertus-v2/embedding-trained-2ep

This model is a fine-tuned version of NewEden/Apertus-8B-2509-patched-chatML on the allura-forge/koto-instruct-sft-nothink dataset. It achieves the following results on the evaluation set:

  • Loss: 1.0096
  • Memory/max Active (GiB): 5.33
  • Memory/max Allocated (GiB): 5.33
  • Memory/device Reserved (GiB): 18.79
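
The model was trained with the ChatML chat template (`chat_template: chatml` in the config), so prompts should be built through the tokenizer's chat template. Below is a minimal inference sketch, assuming the published weights under Columbidae/Apertus-8B-2509-ChatML-trained-embeddings (listed at the end of this card) and a Transformers build with Apertus support; the prompt and sampling settings are placeholders:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "Columbidae/Apertus-8B-2509-ChatML-trained-embeddings"
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(
    repo, torch_dtype=torch.bfloat16, device_map="auto"
)

# ChatML-formatted conversation, applied via the tokenizer's chat template.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a short scene set in a lighthouse."},
]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(input_ids, max_new_tokens=256, do_sample=True, temperature=0.7)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```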

Model description

More information needed

Intended uses & limitations

More information needed

Training and evaluation data

More information needed

Training procedure

Training hyperparameters

The following hyperparameters were used during training:

  • learning_rate: 5e-05
  • train_batch_size: 1
  • eval_batch_size: 1
  • seed: 42
  • distributed_type: multi-GPU
  • num_devices: 2
  • gradient_accumulation_steps: 4
  • total_train_batch_size: 8
  • total_eval_batch_size: 2
  • optimizer: AdamW (adamw_torch_fused) with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
  • lr_scheduler_type: cosine
  • lr_scheduler_warmup_steps: 43
  • training_steps: 1736
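
The aggregate values above follow directly from the Axolotl config; a quick arithmetic check (illustrative only):

```python
# total_train_batch_size = micro_batch_size x gradient_accumulation_steps x num_devices
micro_batch_size = 1
gradient_accumulation_steps = 4
num_devices = 2
print(micro_batch_size * gradient_accumulation_steps * num_devices)  # 8

# warmup steps = warmup_ratio x total training steps, rounded
training_steps = 1736
warmup_ratio = 0.025
print(round(training_steps * warmup_ratio))  # 43, matching lr_scheduler_warmup_steps
```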

Training results

| Training Loss | Epoch  | Step | Validation Loss | Active (GiB) | Allocated (GiB) | Reserved (GiB) |
|---------------|--------|------|-----------------|--------------|-----------------|----------------|
| No log        | 0      | 0    | 1.1357          | 6.25         | 5.32            | 12.43          |
| 1.2319        | 0.1002 | 87   | 1.1015          | 5.33         | 5.33            | 18.79          |
| 0.9188        | 0.2003 | 174  | 1.0660          | 5.33         | 5.33            | 18.79          |
| 0.9956        | 0.3005 | 261  | 1.0509          | 5.33         | 5.33            | 18.79          |
| 1.0228        | 0.4007 | 348  | 1.0405          | 5.33         | 5.33            | 18.79          |
| 1.1445        | 0.5009 | 435  | 1.0353          | 5.33         | 5.33            | 18.79          |
| 0.9755        | 0.6010 | 522  | 1.0302          | 5.33         | 5.33            | 18.79          |
| 1.0101        | 0.7012 | 609  | 1.0275          | 5.33         | 5.33            | 18.79          |
| 0.9641        | 0.8014 | 696  | 1.0244          | 5.33         | 5.33            | 18.79          |
| 1.1194        | 0.9016 | 783  | 1.0215          | 5.33         | 5.33            | 18.79          |
| 1.1722        | 1.0012 | 870  | 1.0188          | 5.33         | 5.33            | 18.79          |
| 1.1047        | 1.1013 | 957  | 1.0171          | 5.33         | 5.33            | 18.79          |
| 0.9053        | 1.2015 | 1044 | 1.0152          | 5.33         | 5.33            | 18.79          |
| 0.927         | 1.3017 | 1131 | 1.0139          | 5.33         | 5.33            | 18.79          |
| 1.0436        | 1.4018 | 1218 | 1.0123          | 5.33         | 5.33            | 18.79          |
| 0.9647        | 1.5020 | 1305 | 1.0114          | 5.33         | 5.33            | 18.79          |
| 1.0689        | 1.6022 | 1392 | 1.0105          | 5.33         | 5.33            | 18.79          |
| 1.0046        | 1.7024 | 1479 | 1.0100          | 5.33         | 5.33            | 18.79          |
| 0.9518        | 1.8025 | 1566 | 1.0097          | 5.33         | 5.33            | 18.79          |
| 0.9851        | 1.9027 | 1653 | 1.0096          | 5.33         | 5.33            | 18.79          |

Framework versions

  • Transformers 4.56.1
  • PyTorch 2.7.1+cu126
  • Datasets 4.0.0
  • Tokenizers 0.22.1
Model repository: Columbidae/Apertus-8B-2509-ChatML-trained-embeddings

Dataset used to train: allura-forge/koto-instruct-sft-nothink