train_data_path: null # Replace with the path to the root of the training data directory, with subdirectories for each generator (e.g. gp, kernel, etc.)
model_path: ./models # Path where the model will be saved
model_name: TempoPFN
continue_training: false
checkpoint_path: null # Replace with the path to the checkpoint file
seed: 2025

wandb: true # Whether to log to wandb
wandb_project_name: TempoPFNTraining
wandb_entity: university-of-freiburg-2024
wandb_plots: false

batch_size: 40
num_training_iterations: 1000000 # 1M
validation_batch_size: 64
num_validation_batches: 1
num_workers: 4
gradient_accumulation_enabled: true
accumulation_steps: 5 # Number of batches to accumulate before updating (effective batch size = batch_size * accumulation_steps)
log_interval: 2048
save_every: 100000

generator_proportions:
  forecast_pfn: 1.0
  gp: 1.0
  kernel: 1.0
  sawtooth: 1.0
  sinewave: 1.0
  step: 1.0
  anomaly: 1.0
  spike: 1.0
  cauker_univariate: 1.0
  ou_process: 3.0
  audio_financial_volatility: 0.1
  audio_multi_scale_fractal: 0.1
  audio_network_topology: 0.5
  audio_stochastic_rhythm: 0.5
  augmented_per_sample_2048: 2.0
  augmented_temp_batch_2048: 2.0

# Learning Rate Scheduler Configuration
lr_scheduler: cosine # Options: "warmup_stable_decay", "cosine_with_warmup", "cosine_with_restarts", "cosine"

# Learning Rate Parameters
peak_lr: 0.0002 # 2e-4 - Peak learning rate
min_lr_ratio: 0.01 # Minimum LR as a fraction of the peak LR

# WSD Scheduler Specific Parameters
warmup_ratio: 0.003 # 0.3% of total steps for warmup
stable_ratio: 0.90 # 90% of total steps at the stable learning rate
decay_type: cosine # Type of decay: "cosine" or "linear"

# Alternative Scheduler Parameters (if using a different scheduler)
num_cycles: 0.5 # For cosine_with_warmup: 0.5 = half cosine wave
num_restart_cycles: 4 # For cosine_with_restarts: number of restart cycles

# Optimizer Configuration
weight_decay: 0.01 # Weight decay for AdamW
beta1: 0.9 # Adam beta1 parameter
beta2: 0.98 # Adam beta2 parameter (optimized for transformers)
optimizer_eps: 1e-6 # Adam epsilon

# Training Stability
gradient_clip_val: 100.0
scaler: custom_robust

gift_eval:
  evaluate_on_gift_eval: false
  max_context_length: 3072
  create_plots: false
  max_plots: 5
  dataset_storage_path: null # Replace with the path to the dataset storage directory

data_augmentation:
  nan_augmentation: true
  scaler_augmentation: false
  length_shortening: true
  nan_stats_path: ./data/nan_stats.json
  augmentation_probabilities:
    scaler_augmentation: 0.5

TimeSeriesModel:
  # Core architecture
  embed_size: 512
  num_encoder_layers: 10

  # Scaling and preprocessing
  scaler: custom_robust
  epsilon: 0.00001
  scaler_clamp_value: null
  handle_constants: false

  # Time features
  K_max: 25
  time_feature_config:
    use_enhanced_features: true
    use_holiday_features: false
    use_index_features: true
    include_seasonality_info: true
  drop_enc_allow: false
  encoding_dropout: 0.0

  # Encoder configuration
  encoder_config:
    attn_mode: chunk
    num_heads: 4
    expand_v: 1.0
    use_short_conv: true
    conv_size: 32
    allow_neg_eigval: true
    hidden_ratio: 1.0
    use_gate: true
    use_forget_gate: true
    num_householder: 4
    weaving: true

  # Loss configuration
  loss_type: 'quantile'
  quantiles: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
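
# Worked example (not a config option): with batch_size: 40 and accumulation_steps: 5,
# gradients from 5 consecutive batches are accumulated before each optimizer step,
# giving an effective batch size of 40 * 5 = 200 sequences per parameter update.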
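
# Worked example (not a config option): with peak_lr: 0.0002 and min_lr_ratio: 0.01,
# the schedule decays to a minimum learning rate of 0.01 * 2e-4 = 2e-6. If the WSD
# scheduler were selected instead (lr_scheduler: warmup_stable_decay) and its ratios
# apply to num_training_iterations = 1,000,000, that would give roughly 3,000 warmup
# steps, 900,000 stable steps, and the remaining ~97,000 steps for decay.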
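
# Assumption, not confirmed by this file: generator_proportions are presumably
# normalized into sampling probabilities over the synthetic data generators. The
# weights above sum to 17.2, so e.g. ou_process (3.0) would account for about
# 3.0 / 17.2 ≈ 17% of sampled batches and each weight-1.0 generator for about 5.8%.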
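
# Assumption, not confirmed by this file: loss_type: 'quantile' presumably selects the
# standard pinball loss averaged over the nine quantile levels listed above, i.e.
# L_q(y, y_hat) = max(q * (y - y_hat), (q - 1) * (y - y_hat)) for each level q,
# with the 0.5 quantile serving as the point (median) forecast.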