# TempoPFN training configuration (configs/example.yaml)
train_data_path: null # Path to the root of the training data directory, containing one subdirectory per generator (e.g. gp, kernel)
model_path: ./models # Path where the model will be saved
model_name: TempoPFN
continue_training: false
checkpoint_path: null # Path to the checkpoint file to resume from when continue_training is true
seed: 2025
wandb: true # whether to log to wandb
wandb_project_name: TempoPFNTraining
wandb_entity: university-of-freiburg-2024
wandb_plots: false
batch_size: 40
num_training_iterations: 1000000 # 1M
validation_batch_size: 64
num_validation_batches: 1
num_workers: 4
gradient_accumulation_enabled: true
accumulation_steps: 5 # Number of batches to accumulate before updating (effective batch size = batch_size * accumulation_steps)
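# With batch_size = 40 and accumulation_steps = 5, the effective batch size is 40 * 5 = 200.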
log_interval: 2048
save_every: 100000
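# Over num_training_iterations = 1,000,000, save_every = 100000 yields roughly 10 checkpoints.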
generator_proportions:
  forecast_pfn: 1.0
  gp: 1.0
  kernel: 1.0
  sawtooth: 1.0
  sinewave: 1.0
  step: 1.0
  anomaly: 1.0
  spike: 1.0
  cauker_univariate: 1.0
  ou_process: 3.0
  audio_financial_volatility: 0.1
  audio_multi_scale_fractal: 0.1
  audio_network_topology: 0.5
  audio_stochastic_rhythm: 0.5
  augmented_per_sample_2048: 2.0
  augmented_temp_batch_2048: 2.0
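  # These values appear to be relative sampling weights rather than probabilities
  # (assumption): they sum to 17.2, so ou_process (3.0) would account for roughly
  # 3.0 / 17.2 ≈ 17% of sampled series if the weights are normalized.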
# Learning Rate Scheduler Configuration
lr_scheduler: cosine # Options: "warmup_stable_decay", "cosine_with_warmup", "cosine_with_restarts", "cosine"
# Learning Rate Parameters
peak_lr: 0.0002 # 2e-4 - Peak learning rate
min_lr_ratio: 0.01 # Minimum LR as fraction of peak LR
# WSD Scheduler Specific Parameters
warmup_ratio: 0.003 # 0.3% of total steps for warmup
stable_ratio: 0.90 # 90% of total steps at stable learning rate
decay_type: cosine # Type of decay: "cosine" or "linear"
# Alternative Scheduler Parameters (if using different schedulers)
num_cycles: 0.5 # For cosine_with_warmup: 0.5 = half cosine wave
num_restart_cycles: 4 # For cosine_with_restarts: number of restart cycles
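# Worked example for the WSD scheduler (not selected above), assuming the ratios apply to
# num_training_iterations = 1,000,000: warmup = 0.003 * 1,000,000 = 3,000 steps;
# stable = 0.90 * 1,000,000 = 900,000 steps; the remaining ~97,000 steps decay from
# peak_lr (2e-4) toward min_lr_ratio * peak_lr = 2e-6.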
# Optimizer Configuration
weight_decay: 0.01 # Weight decay for AdamW
beta1: 0.9 # Adam beta1 parameter
beta2: 0.98 # Adam beta2 parameter (optimized for transformers)
optimizer_eps: 1.0e-6 # Adam epsilon (written as 1.0e-6 so YAML parses it as a float)
# Training Stability
gradient_clip_val: 100.0
scaler: custom_robust
gift_eval:
  evaluate_on_gift_eval: false
  max_context_length: 3072
  create_plots: false
  max_plots: 5
  dataset_storage_path: null # Path to the GIFT-Eval dataset storage directory
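  # GIFT-Eval evaluation is disabled by default; set evaluate_on_gift_eval to true
  # and point dataset_storage_path at a local copy of the benchmark data to enable it.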
data_augmentation:
  nan_augmentation: true
  scaler_augmentation: false
  length_shortening: true
  nan_stats_path: ./data/nan_stats.json
  augmentation_probabilities:
    scaler_augmentation: 0.5
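    # Note: scaler_augmentation is set to false above, so this probability presumably
    # has no effect unless that augmentation is enabled.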
TimeSeriesModel:
  # Core architecture
  embed_size: 512
  num_encoder_layers: 10
  # Scaling and preprocessing
  scaler: custom_robust
  epsilon: 0.00001
  scaler_clamp_value: null
  handle_constants: false
  # Time features
  K_max: 25
  time_feature_config:
    use_enhanced_features: true
    use_holiday_features: false
    use_index_features: true
    include_seasonality_info: true
  drop_enc_allow: false
  encoding_dropout: 0.0
  # Encoder configuration
  encoder_config:
    attn_mode: chunk
    num_heads: 4
    expand_v: 1.0
    use_short_conv: true
    conv_size: 32
    allow_neg_eigval: true
    hidden_ratio: 1.0
    use_gate: true
    use_forget_gate: true
    num_householder: 4
    weaving: true
  loss_type: quantile
  quantiles: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
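  # The quantile loss is presumably the standard pinball loss,
  #   L_q(y, y_hat) = max(q * (y - y_hat), (q - 1) * (y - y_hat)),
  # averaged over the quantile levels listed above; the 0.5 quantile doubles as the
  # median (point) forecast.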