---
# TempoPFN training configuration — top-level run settings.
train_data_path: null  # Replace with the path to root of the training data directory with subdirectories for each generator (e.g. gp, kernel, etc.)
model_path: ./models  # Path where the model will be saved
model_name: TempoPFN
continue_training: false
checkpoint_path: null  # Replace with the path to the checkpoint file
seed: 2025
wandb: true  # whether to log to wandb
wandb_project_name: TempoPFNTraining
wandb_entity: university-of-freiburg-2024
wandb_plots: false
batch_size: 40
num_training_iterations: 1000000  # 1M
validation_batch_size: 64
num_validation_batches: 1
num_workers: 4
gradient_accumulation_enabled: true
accumulation_steps: 5  # Number of batches to accumulate before updating (effective batch size = batch_size * accumulation_steps)
log_interval: 2048
save_every: 100000
# Relative sampling weights for each synthetic data generator.
# Values are unnormalized proportions; child keys must be nested
# under generator_proportions (indentation restored — the table
# extraction had flattened it, which is invalid YAML).
generator_proportions:
  forecast_pfn: 1.0
  gp: 1.0
  kernel: 1.0
  sawtooth: 1.0
  sinewave: 1.0
  step: 1.0
  anomaly: 1.0
  spike: 1.0
  cauker_univariate: 1.0
  ou_process: 3.0
  audio_financial_volatility: 0.1
  audio_multi_scale_fractal: 0.1
  audio_network_topology: 0.5
  audio_stochastic_rhythm: 0.5
  augmented_per_sample_2048: 2.0
  augmented_temp_batch_2048: 2.0
# Learning Rate Scheduler Configuration
lr_scheduler: cosine  # Options: "warmup_stable_decay", "cosine_with_warmup", "cosine_with_restarts", "cosine"
# Learning Rate Parameters
peak_lr: 0.0002  # 2e-4 - Peak learning rate
min_lr_ratio: 0.01  # Minimum LR as fraction of peak LR
# WSD Scheduler Specific Parameters
warmup_ratio: 0.003  # 0.3% of total steps for warmup
stable_ratio: 0.90  # 90% of total steps at stable learning rate
decay_type: cosine  # Type of decay: "cosine" or "linear"
# Alternative Scheduler Parameters (if using different schedulers)
num_cycles: 0.5  # For cosine_with_warmup: 0.5 = half cosine wave
num_restart_cycles: 4  # For cosine_with_restarts: number of restart cycles
# Optimizer Configuration
weight_decay: 0.01  # Weight decay for AdamW
beta1: 0.9  # Adam beta1 parameter
beta2: 0.98  # Adam beta2 parameter (optimized for transformers)
# Adam epsilon. Written as 1.0e-6 (not bare 1e-6): YAML 1.1 loaders such
# as PyYAML require a '.' in scientific notation, so "1e-6" would be
# loaded as the STRING "1e-6" instead of a float.
optimizer_eps: 1.0e-6
# Training Stability
gradient_clip_val: 100.0
scaler: custom_robust
# GIFT-Eval benchmark evaluation settings (child keys re-nested under
# gift_eval — the table extraction had flattened the indentation).
gift_eval:
  evaluate_on_gift_eval: false
  max_context_length: 3072
  create_plots: false
  max_plots: 5
  dataset_storage_path: null  # Replace with the path to the dataset storage path
# Data-augmentation toggles and per-augmentation probabilities
# (child keys re-nested — the table extraction had flattened them).
data_augmentation:
  nan_augmentation: true
  scaler_augmentation: false
  length_shortening: true
  nan_stats_path: ./data/nan_stats.json
  augmentation_probabilities:
    scaler_augmentation: 0.5
# Model architecture configuration. Nesting reconstructed from the
# section comments after the table extraction flattened all indentation.
# NOTE(review): levels of drop_enc_allow / encoding_dropout and of
# loss_type / quantiles were inferred — confirm against the config loader.
TimeSeriesModel:
  # Core architecture
  embed_size: 512
  num_encoder_layers: 10
  # Scaling and preprocessing
  scaler: custom_robust
  epsilon: 0.00001
  scaler_clamp_value: null
  handle_constants: false
  # Time features
  K_max: 25
  time_feature_config:
    use_enhanced_features: true
    use_holiday_features: false
    use_index_features: true
    include_seasonality_info: true
  drop_enc_allow: false
  encoding_dropout: 0.0
  # Encoder configuration
  encoder_config:
    attn_mode: chunk
    num_heads: 4
    expand_v: 1.0
    use_short_conv: true
    conv_size: 32
    allow_neg_eigval: true
    hidden_ratio: 1.0
    use_gate: true
    use_forget_gate: true
    num_householder: 4
    weaving: true
  loss_type: 'quantile'
  quantiles: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]