Vladyslav Moroshan committed on Nov 7

Commit

0a58567

1 Parent(s): 4972944

Apply ruff formatting

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

examples/generate_synthetic_data.py +32 -68
examples/gift_eval/gift_eval_runner.py +25 -51
examples/gift_eval/gift_eval_submission.ipynb +116 -223
examples/quick_start_tempo_pfn.ipynb +7 -7
examples/quick_start_tempo_pfn.py +6 -15
examples/utils.py +7 -44
pyproject.toml +30 -0
src/data/augmentations.py +77 -182
src/data/batch_composer.py +51 -91
src/data/constants.py +1 -2
src/data/containers.py +20 -33
src/data/datasets.py +8 -15
src/data/filter.py +1 -3
src/data/frequency.py +13 -19
src/data/loaders.py +44 -82
src/data/scalers.py +24 -53
src/data/time_features.py +16 -40
src/data/utils.py +5 -6
src/gift_eval/__init__.py +5 -1
src/gift_eval/constants.py +2 -5
src/gift_eval/core.py +4 -7
src/gift_eval/data.py +12 -46
src/gift_eval/evaluate.py +34 -39
src/gift_eval/predictor.py +22 -40
src/gift_eval/results.py +17 -41
src/models/blocks.py +1 -4
src/models/gated_deltaproduct/configuration_gated_deltaproduct.py +3 -6
src/models/gated_deltaproduct/gated_deltaproduct.py +29 -60
src/models/gated_deltaproduct/modeling_gated_deltaproduct.py +10 -18
src/models/model.py +19 -53
src/optim/lr_scheduler.py +8 -21
src/plotting/gift_eval_utils.py +10 -21
src/plotting/plot_timeseries.py +37 -59
src/synthetic_generation/abstract_classes.py +6 -14
src/synthetic_generation/anomalies/anomaly_generator.py +13 -35
src/synthetic_generation/anomalies/anomaly_generator_wrapper.py +1 -6
src/synthetic_generation/audio_generators/financial_volatility_generator.py +5 -14
src/synthetic_generation/audio_generators/financial_volatility_wrapper.py +4 -5
src/synthetic_generation/audio_generators/multi_scale_fractal_generator.py +3 -8
src/synthetic_generation/audio_generators/multi_scale_fractal_wrapper.py +4 -5
src/synthetic_generation/audio_generators/network_topology_generator.py +3 -8
src/synthetic_generation/audio_generators/network_topology_wrapper.py +4 -5
src/synthetic_generation/audio_generators/stochastic_rhythm_generator.py +4 -11
src/synthetic_generation/audio_generators/stochastic_rhythm_wrapper.py +4 -5
src/synthetic_generation/audio_generators/utils.py +1 -1
src/synthetic_generation/augmentations/offline_per_sample_iid_augmentations.py +97 -228
src/synthetic_generation/augmentations/offline_temp_batch_augmentations.py +65 -140
src/synthetic_generation/cauker/cauker_generator.py +12 -22
src/synthetic_generation/cauker/cauker_generator_wrapper.py +3 -6
src/synthetic_generation/continuous_generation.py +30 -79

examples/generate_synthetic_data.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import logging
 import os
-from typing import List, Optional
 import torch
 from src.data.containers import BatchTimeSeriesContainer
 from src.data.utils import sample_future_length
 from src.plotting.plot_timeseries import plot_from_container
@@ -50,12 +49,17 @@ from src.synthetic_generation.spikes.spikes_generator_wrapper import (
 )
 from src.synthetic_generation.steps.step_generator_wrapper import StepGeneratorWrapper
-PYO_AVAILABLE = True
-try:
-    import pyo  # requires portaudio to be installed
-except (ImportError, OSError):
-    PYO_AVAILABLE = False
-else:
     from src.synthetic_generation.audio_generators.financial_volatility_wrapper import (
         FinancialVolatilityAudioWrapper,
     )
@@ -69,9 +73,7 @@ else:
         StochasticRhythmAudioWrapper,
     )
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
 logger = logging.getLogger(__name__)
@@ -79,9 +81,9 @@ def visualize_batch_sample(
     generator,
     batch_size: int = 8,
     output_dir: str = "outputs/plots",
-    sample_idx: Optional[int] = None,
     prefix: str = "",
-    seed: Optional[int] = None,
 ) -> None:
     os.makedirs(output_dir, exist_ok=True)
     name = generator.__class__.__name__
@@ -105,78 +107,40 @@ def visualize_batch_sample(
     indices = [sample_idx] if sample_idx is not None else range(batch_size)
     for i in indices:
-        filename = (
-            f"{prefix}_{name.lower().replace('generatorwrapper', '')}_sample_{i}.png"
-        )
         output_file = os.path.join(output_dir, filename)
         title = f"{prefix.capitalize()} {name.replace('GeneratorWrapper', '')} Synthetic Series (Sample {i})"
-        plot_from_container(
-            container, sample_idx=i, output_file=output_file, show=False, title=title
-        )
         logger.info(f"[{name}] Saved plot to {output_file}")
-def generator_factory(global_seed: int, total_length: int) -> List:
     generators = [
-        KernelGeneratorWrapper(
-            KernelGeneratorParams(global_seed=global_seed, length=total_length)
-        ),
-        GPGeneratorWrapper(
-            GPGeneratorParams(global_seed=global_seed, length=total_length)
-        ),
-        ForecastPFNGeneratorWrapper(
-            ForecastPFNGeneratorParams(global_seed=global_seed, length=total_length)
-        ),
-        SineWaveGeneratorWrapper(
-            SineWaveGeneratorParams(global_seed=global_seed, length=total_length)
-        ),
-        SawToothGeneratorWrapper(
-            SawToothGeneratorParams(global_seed=global_seed, length=total_length)
-        ),
-        StepGeneratorWrapper(
-            StepGeneratorParams(global_seed=global_seed, length=total_length)
-        ),
-        AnomalyGeneratorWrapper(
-            AnomalyGeneratorParams(global_seed=global_seed, length=total_length)
-        ),
-        SpikesGeneratorWrapper(
-            SpikesGeneratorParams(global_seed=global_seed, length=total_length)
-        ),
-        CauKerGeneratorWrapper(
-            CauKerGeneratorParams(
-                global_seed=global_seed, length=total_length, num_channels=5
-            )
-        ),
         OrnsteinUhlenbeckProcessGeneratorWrapper(
-            OrnsteinUhlenbeckProcessGeneratorParams(
-                global_seed=global_seed, length=total_length
-            )
         ),
     ]
     if PYO_AVAILABLE:
         generators.extend(
             [
-                StochasticRhythmAudioWrapper(
-                    StochasticRhythmAudioParams(
-                        global_seed=global_seed, length=total_length
-                    )
-                ),
                 FinancialVolatilityAudioWrapper(
-                    FinancialVolatilityAudioParams(
-                        global_seed=global_seed, length=total_length
-                    )
                 ),
                 MultiScaleFractalAudioWrapper(
-                    MultiScaleFractalAudioParams(
-                        global_seed=global_seed, length=total_length
-                    )
-                ),
-                NetworkTopologyAudioWrapper(
-                    NetworkTopologyAudioParams(
-                        global_seed=global_seed, length=total_length
-                    )
                 ),
             ]
         )
     else:

+import importlib
 import logging
 import os
 import torch
 from src.data.containers import BatchTimeSeriesContainer
 from src.data.utils import sample_future_length
 from src.plotting.plot_timeseries import plot_from_container
 )
 from src.synthetic_generation.steps.step_generator_wrapper import StepGeneratorWrapper
+PYO_AVAILABLE = False
+spec = importlib.util.find_spec("pyo")
+if spec is not None:
+    try:
+        _pyo = importlib.import_module("pyo")  # intentionally assigned to underscore to avoid unused-import lint
+    except (ImportError, OSError):
+        PYO_AVAILABLE = False
+    else:
+        PYO_AVAILABLE = True
+if PYO_AVAILABLE:
     from src.synthetic_generation.audio_generators.financial_volatility_wrapper import (
         FinancialVolatilityAudioWrapper,
     )
         StochasticRhythmAudioWrapper,
     )
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
     generator,
     batch_size: int = 8,
     output_dir: str = "outputs/plots",
+    sample_idx: int | None = None,
     prefix: str = "",
+    seed: int | None = None,
 ) -> None:
     os.makedirs(output_dir, exist_ok=True)
     name = generator.__class__.__name__
     indices = [sample_idx] if sample_idx is not None else range(batch_size)
     for i in indices:
+        filename = f"{prefix}_{name.lower().replace('generatorwrapper', '')}_sample_{i}.png"
         output_file = os.path.join(output_dir, filename)
         title = f"{prefix.capitalize()} {name.replace('GeneratorWrapper', '')} Synthetic Series (Sample {i})"
+        plot_from_container(container, sample_idx=i, output_file=output_file, show=False, title=title)
         logger.info(f"[{name}] Saved plot to {output_file}")
+def generator_factory(global_seed: int, total_length: int) -> list:
     generators = [
+        KernelGeneratorWrapper(KernelGeneratorParams(global_seed=global_seed, length=total_length)),
+        GPGeneratorWrapper(GPGeneratorParams(global_seed=global_seed, length=total_length)),
+        ForecastPFNGeneratorWrapper(ForecastPFNGeneratorParams(global_seed=global_seed, length=total_length)),
+        SineWaveGeneratorWrapper(SineWaveGeneratorParams(global_seed=global_seed, length=total_length)),
+        SawToothGeneratorWrapper(SawToothGeneratorParams(global_seed=global_seed, length=total_length)),
+        StepGeneratorWrapper(StepGeneratorParams(global_seed=global_seed, length=total_length)),
+        AnomalyGeneratorWrapper(AnomalyGeneratorParams(global_seed=global_seed, length=total_length)),
+        SpikesGeneratorWrapper(SpikesGeneratorParams(global_seed=global_seed, length=total_length)),
+        CauKerGeneratorWrapper(CauKerGeneratorParams(global_seed=global_seed, length=total_length, num_channels=5)),
         OrnsteinUhlenbeckProcessGeneratorWrapper(
+            OrnsteinUhlenbeckProcessGeneratorParams(global_seed=global_seed, length=total_length)
         ),
     ]
     if PYO_AVAILABLE:
         generators.extend(
             [
+                StochasticRhythmAudioWrapper(StochasticRhythmAudioParams(global_seed=global_seed, length=total_length)),
                 FinancialVolatilityAudioWrapper(
+                    FinancialVolatilityAudioParams(global_seed=global_seed, length=total_length)
                 ),
                 MultiScaleFractalAudioWrapper(
+                    MultiScaleFractalAudioParams(global_seed=global_seed, length=total_length)
                 ),
+                NetworkTopologyAudioWrapper(NetworkTopologyAudioParams(global_seed=global_seed, length=total_length)),
             ]
         )
     else:

examples/gift_eval/gift_eval_runner.py CHANGED Viewed

@@ -1,37 +1,33 @@
 #!/usr/bin/env python
 """
-GIFT-Eval Runner Script
 This script evaluates the Time Series model on GIFT-Eval datasets using the `src/gift_eval` pipeline.
 - Uses `src/gift_eval/data.py` for dataset handling.
 - Uses `src/gift_eval/predictor.TimeSeriesPredictor` for inference.
-- Loads a model from a checkpoint.
-- Writes per-dataset CSV metrics to `output_dir` without creating plots.
 """
 import argparse
 import logging
 from pathlib import Path
-from typing import List, Optional
-from examples.utils import download_checkpoint_if_needed
 from src.gift_eval.constants import ALL_DATASETS
 from src.gift_eval.evaluate import evaluate_datasets
 from src.gift_eval.predictor import TimeSeriesPredictor
 from src.gift_eval.results import aggregate_results, write_results_to_disk
 # Configure logging
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
 logging.getLogger("matplotlib").setLevel(logging.WARNING)
 logging.getLogger("matplotlib.font_manager").setLevel(logging.WARNING)
 logger = logging.getLogger("gift_eval_runner")
-def _expand_datasets_arg(datasets_arg: List[str] | str) -> List[str]:
     """Expand dataset argument to list of dataset names."""
     if isinstance(datasets_arg, str):
         if datasets_arg == "all":
@@ -50,12 +46,12 @@ def _expand_datasets_arg(datasets_arg: List[str] | str) -> List[str]:
 def run_evaluation(
     predictor: TimeSeriesPredictor,
-    datasets_arg: List[str] | str,
-    terms_arg: List[str],
     dataset_storage_path: str,
-    max_windows_arg: Optional[int],
     batch_size_arg: int,
-    max_context_length_arg: Optional[int],
     output_dir_arg: str,
     model_name_arg: str,
     after_each_dataset_flush: bool = True,
@@ -89,16 +85,13 @@ def run_evaluation(
 def main():
     """Main execution function."""
-    parser = argparse.ArgumentParser(
-        description="GIFT-Eval Runner: Evaluate TimeSeriesModel on GIFT-Eval datasets"
-    )
-    # Model configuration
     parser.add_argument(
         "--model_path",
         type=str,
-        default=None,
-        help="Path to model checkpoint. If not provided, will download from checkpoint_url.",
     )
     parser.add_argument(
         "--config_path",
@@ -106,18 +99,6 @@ def main():
         default="configs/example.yaml",
         help="Path to model config YAML (default: configs/example.yaml)",
     )
-    parser.add_argument(
-        "--checkpoint_url",
-        type=str,
-        default="https://www.dropbox.com/scl/fi/mqsni5lehooyaw93y3uzq/checkpoint_38M.pth?rlkey=3uyehvmtted02xkha24zgpzb6&st=seevsbkn&dl=0",
-        help="URL to download checkpoint from if model_path is not provided",
-    )
-    parser.add_argument(
-        "--download_dir",
-        type=str,
-        default="models",
-        help="Directory to download checkpoint to (default: models)",
-    )
     # Dataset configuration
     parser.add_argument(
@@ -185,29 +166,20 @@ def main():
     # Resolve paths
     config_path = Path(args.config_path)
-    download_dir = Path(args.download_dir)
     output_dir = Path(args.output_dir)
-    # Determine model path
-    resolved_model_path = None
-    if args.model_path:
-        resolved_model_path = args.model_path
-    elif args.checkpoint_url:
-        resolved_model_path = download_checkpoint_if_needed(
-            args.checkpoint_url, target_dir=download_dir
-        )
-    if not resolved_model_path:
-        raise FileNotFoundError(
-            "No model checkpoint provided. Set --model_path or --checkpoint_url."
-        )
     if not config_path.exists():
         raise FileNotFoundError(f"Config not found: {config_path}")
     logger.info("Loading predictor from checkpoint: %s", resolved_model_path)
     predictor = TimeSeriesPredictor.from_paths(
-        model_path=resolved_model_path,
         config_path=str(config_path),
         ds_prediction_length=1,  # placeholder; set per dataset
         ds_freq="D",  # placeholder; set per dataset
@@ -235,17 +207,19 @@ def main():
     )
     logger.info("Evaluation complete. See results under: %s", output_dir)
     # Aggregate all results into a single CSV file
     logger.info("Aggregating results from all datasets...")
     combined_df = aggregate_results(result_root_dir=output_dir)
     if combined_df is not None:
-        logger.info("Successfully created aggregated results file: %s/all_results.csv", output_dir)
     else:
         logger.warning("No results to aggregate. Check that evaluation completed successfully.")
 if __name__ == "__main__":
     main()

 #!/usr/bin/env python
 """
+GIFT-Eval Runner Script (Hugging Face Repository Version)
 This script evaluates the Time Series model on GIFT-Eval datasets using the `src/gift_eval` pipeline.
+- Assumes it is running inside the cloned Hugging Face repository.
 - Uses `src/gift_eval/data.py` for dataset handling.
 - Uses `src/gift_eval/predictor.TimeSeriesPredictor` for inference.
+- Loads the model from the local checkpoint (e.g., `models/checkpoint_38M.pth`).
+- Writes per-dataset CSV metrics to `output_dir`.
 """
 import argparse
 import logging
 from pathlib import Path
 from src.gift_eval.constants import ALL_DATASETS
 from src.gift_eval.evaluate import evaluate_datasets
 from src.gift_eval.predictor import TimeSeriesPredictor
 from src.gift_eval.results import aggregate_results, write_results_to_disk
 # Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logging.getLogger("matplotlib").setLevel(logging.WARNING)
 logging.getLogger("matplotlib.font_manager").setLevel(logging.WARNING)
 logger = logging.getLogger("gift_eval_runner")
+def _expand_datasets_arg(datasets_arg: list[str] | str) -> list[str]:
     """Expand dataset argument to list of dataset names."""
     if isinstance(datasets_arg, str):
         if datasets_arg == "all":
 def run_evaluation(
     predictor: TimeSeriesPredictor,
+    datasets_arg: list[str] | str,
+    terms_arg: list[str],
     dataset_storage_path: str,
+    max_windows_arg: int | None,
     batch_size_arg: int,
+    max_context_length_arg: int | None,
     output_dir_arg: str,
     model_name_arg: str,
     after_each_dataset_flush: bool = True,
 def main():
     """Main execution function."""
+    parser = argparse.ArgumentParser(description="GIFT-Eval Runner: Evaluate TimeSeriesModel on GIFT-Eval datasets")
     parser.add_argument(
         "--model_path",
         type=str,
+        default="models/checkpoint_38M.pth",
+        help="Path to a local model checkpoint (default: models/checkpoint_38M.pth in this repo).",
     )
     parser.add_argument(
         "--config_path",
         default="configs/example.yaml",
         help="Path to model config YAML (default: configs/example.yaml)",
     )
     # Dataset configuration
     parser.add_argument(
     # Resolve paths
     config_path = Path(args.config_path)
     output_dir = Path(args.output_dir)
+    resolved_model_path = Path(args.model_path)
+    if not resolved_model_path.exists():
+        logger.error(f"Model checkpoint not found at: {resolved_model_path}")
+        logger.error("Please ensure the file exists or you've cloned the repo using Git LFS.")
+        raise FileNotFoundError(f"No model checkpoint found at {resolved_model_path}")
     if not config_path.exists():
         raise FileNotFoundError(f"Config not found: {config_path}")
     logger.info("Loading predictor from checkpoint: %s", resolved_model_path)
     predictor = TimeSeriesPredictor.from_paths(
+        model_path=str(resolved_model_path),
         config_path=str(config_path),
         ds_prediction_length=1,  # placeholder; set per dataset
         ds_freq="D",  # placeholder; set per dataset
     )
     logger.info("Evaluation complete. See results under: %s", output_dir)
     # Aggregate all results into a single CSV file
     logger.info("Aggregating results from all datasets...")
     combined_df = aggregate_results(result_root_dir=output_dir)
     if combined_df is not None:
+        logger.info(
+            "Successfully created aggregated results file: %s/all_results.csv",
+            output_dir,
+        )
     else:
         logger.warning("No results to aggregate. Check that evaluation completed successfully.")
 if __name__ == "__main__":
     main()

examples/gift_eval/gift_eval_submission.ipynb CHANGED Viewed

@@ -41,38 +41,33 @@
    "metadata": {},
    "outputs": [],
    "source": [
     "import json\n",
     "import logging\n",
-    "import os\n",
     "import math\n",
-    "import csv\n",
-    "import glob\n",
-    "import argparse\n",
     "import warnings\n",
-    "import yaml\n",
-    "from pathlib import Path\n",
-    "from typing import List, Optional, Dict, Tuple, Union, Iterator, Iterable, Any\n",
-    "from functools import cached_property\n",
-    "from enum import Enum\n",
     "from dataclasses import dataclass\n",
-    "\n",
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "import torch\n",
-    "from torch.nn.parallel import DistributedDataParallel as DDP\n",
-    "from dotenv import load_dotenv\n",
     "\n",
     "# GluonTS and Data Handling\n",
     "import datasets\n",
     "import pyarrow.compute as pc\n",
     "from gluonts.dataset import DataEntry\n",
     "from gluonts.dataset.common import ProcessDataEntry\n",
     "from gluonts.dataset.split import TestData, TrainingDataset, split\n",
-    "from gluonts.itertools import Map\n",
-    "from gluonts.time_feature import norm_freq_str, get_seasonality\n",
-    "from gluonts.transform import Transformation\n",
-    "from pandas.tseries.frequencies import to_offset\n",
-    "from toolz import compose\n",
     "\n",
     "# GluonTS Evaluation\n",
     "from gluonts.ev.metrics import (\n",
@@ -87,14 +82,14 @@
     "    SMAPE,\n",
     "    MeanWeightedSumQuantileLoss,\n",
     ")\n",
     "from gluonts.model.evaluation import evaluate_model\n",
     "from gluonts.model.forecast import QuantileForecast\n",
     "from gluonts.model.predictor import Predictor\n",
-    "\n",
-    "# Plotting and Warnings\n",
-    "import matplotlib\n",
-    "import matplotlib.pyplot as plt\n",
     "from linear_operator.utils.cholesky import NumericalWarning\n",
     "\n",
     "# --- TempoPFN Core Model Imports ---\n",
     "# These are assumed to be installed or in the PYTHONPATH\n",
@@ -103,6 +98,8 @@
     "from src.data.scalers import RobustScaler\n",
     "from src.models.model import TimeSeriesModel\n",
     "from src.utils.utils import device\n",
     "\n",
     "# --- Setup Logging ---\n",
     "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s - %(levelname)s - %(message)s\")\n",
@@ -111,6 +108,7 @@
     "logging.getLogger(\"PIL\").setLevel(logging.WARNING)\n",
     "logger = logging.getLogger(\"gift_eval_runner\")\n",
     "\n",
     "# Filter out specific gluonts warnings\n",
     "class WarningFilter(logging.Filter):\n",
     "    def __init__(self, text_to_filter: str) -> None:\n",
@@ -120,10 +118,9 @@
     "    def filter(self, record: logging.LogRecord) -> bool:\n",
     "        return self.text_to_filter not in record.getMessage()\n",
     "\n",
     "gts_logger = logging.getLogger(\"gluonts.model.forecast\")\n",
-    "gts_logger.addFilter(\n",
-    "    WarningFilter(\"The mean prediction is not stored in the forecast data\")\n",
-    ")\n",
     "\n",
     "# Filter out numerical warnings\n",
     "warnings.filterwarnings(\"ignore\", category=NumericalWarning)\n",
@@ -167,7 +164,7 @@
     "DATASET_PROPERTIES_PATH = _MODULE_DIR / \"data\" / \"dataset_properties.json\"\n",
     "\n",
     "try:\n",
-    "    with open(DATASET_PROPERTIES_PATH, \"r\") as f:\n",
     "        DATASET_PROPERTIES = json.load(f)\n",
     "except Exception as exc:  # pragma: no cover - logging path\n",
     "    DATASET_PROPERTIES = {}\n",
@@ -286,9 +283,7 @@
     "    RMSE(),\n",
     "    NRMSE(),\n",
     "    ND(),\n",
-    "    MeanWeightedSumQuantileLoss(\n",
-    "        quantile_levels=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]\n",
-    "    ),\n",
     ")\n",
     "\n",
     "# Standard metric names for CSV header\n",
@@ -342,14 +337,14 @@
     "    \"\"\"Container for evaluation results and optional figures.\"\"\"\n",
     "\n",
     "    dataset_metadata: DatasetMetadata\n",
-    "    metrics: Dict\n",
-    "    figures: List[Tuple[object, str]]\n",
     "\n",
     "\n",
-    "DatasetSelection = Union[List[str], Tuple[str, ...], str]\n",
     "\n",
     "\n",
-    "def expand_datasets_arg(datasets: DatasetSelection) -> List[str]:\n",
     "    \"\"\"Normalize dataset selection strings to explicit lists.\"\"\"\n",
     "\n",
     "    if isinstance(datasets, str):\n",
@@ -453,9 +448,7 @@
     "    def __init__(self, field):\n",
     "        self.field = field\n",
     "\n",
-    "    def __call__(\n",
-    "        self, data_it: Iterable[DataEntry], is_train: bool = False\n",
-    "    ) -> Iterator:\n",
     "        for data_entry in data_it:\n",
     "            item_id = data_entry[\"item_id\"]\n",
     "            val_ls = list(data_entry[self.field])\n",
@@ -473,12 +466,10 @@
     "        term: Term | str = Term.SHORT,\n",
     "        to_univariate: bool = False,\n",
     "        storage_path: str = None,\n",
-    "        max_windows: Optional[int] = None,\n",
     "    ):\n",
     "        storage_path = Path(storage_path)\n",
-    "        self.hf_dataset = datasets.load_from_disk(str(storage_path / name)).with_format(\n",
-    "            \"numpy\"\n",
-    "        )\n",
     "        process = ProcessDataEntry(\n",
     "            self.freq,\n",
     "            one_dim_target=self.target_dim == 1,\n",
@@ -486,9 +477,7 @@
     "\n",
     "        self.gluonts_dataset = Map(compose(process, itemize_start), self.hf_dataset)\n",
     "        if to_univariate:\n",
-    "            self.gluonts_dataset = MultivariateToUnivariate(\"target\").apply(\n",
-    "                self.gluonts_dataset\n",
-    "            )\n",
     "\n",
     "        self.term = Term(term)\n",
     "        self.name = name\n",
@@ -499,9 +488,7 @@
     "        freq = norm_freq_str(to_offset(self.freq).name)\n",
     "        if freq.endswith(\"E\"):\n",
     "            freq = freq[:-1]\n",
-    "        pred_len = (\n",
-    "            M4_PRED_LENGTH_MAP[freq] if \"m4\" in self.name else PRED_LENGTH_MAP[freq]\n",
-    "        )\n",
     "        return self.term.multiplier * pred_len\n",
     "\n",
     "    @cached_property\n",
@@ -510,26 +497,13 @@
     "\n",
     "    @cached_property\n",
     "    def target_dim(self) -> int:\n",
-    "        return (\n",
-    "            target.shape[0]\n",
-    "            if len((target := self.hf_dataset[0][\"target\"]).shape) > 1\n",
-    "            else 1\n",
-    "        )\n",
     "\n",
     "    @cached_property\n",
     "    def past_feat_dynamic_real_dim(self) -> int:\n",
     "        if \"past_feat_dynamic_real\" not in self.hf_dataset[0]:\n",
     "            return 0\n",
-    "        elif (\n",
-    "            len(\n",
-    "                (\n",
-    "                    past_feat_dynamic_real := self.hf_dataset[0][\n",
-    "                        \"past_feat_dynamic_real\"\n",
-    "                    ]\n",
-    "                ).shape\n",
-    "            )\n",
-    "            > 1\n",
-    "        ):\n",
     "            return past_feat_dynamic_real.shape[0]\n",
     "        else:\n",
     "            return 1\n",
@@ -544,11 +518,7 @@
     "    @cached_property\n",
     "    def _min_series_length(self) -> int:\n",
     "        if self.hf_dataset[0][\"target\"].ndim > 1:\n",
-    "            lengths = pc.list_value_length(\n",
-    "                pc.list_flatten(\n",
-    "                    pc.list_slice(self.hf_dataset.data.column(\"target\"), 0, 1)\n",
-    "                )\n",
-    "            )\n",
     "        else:\n",
     "            lengths = pc.list_value_length(self.hf_dataset.data.column(\"target\"))\n",
     "        return min(lengths.to_numpy())\n",
@@ -556,32 +526,24 @@
     "    @cached_property\n",
     "    def sum_series_length(self) -> int:\n",
     "        if self.hf_dataset[0][\"target\"].ndim > 1:\n",
-    "            lengths = pc.list_value_length(\n",
-    "                pc.list_flatten(self.hf_dataset.data.column(\"target\"))\n",
-    "            )\n",
     "        else:\n",
     "            lengths = pc.list_value_length(self.hf_dataset.data.column(\"target\"))\n",
     "        return sum(lengths.to_numpy())\n",
     "\n",
     "    @property\n",
     "    def training_dataset(self) -> TrainingDataset:\n",
-    "        training_dataset, _ = split(\n",
-    "            self.gluonts_dataset, offset=-self.prediction_length * (self.windows + 1)\n",
-    "        )\n",
     "        return training_dataset\n",
     "\n",
     "    @property\n",
     "    def validation_dataset(self) -> TrainingDataset:\n",
-    "        validation_dataset, _ = split(\n",
-    "            self.gluonts_dataset, offset=-self.prediction_length * self.windows\n",
-    "        )\n",
     "        return validation_dataset\n",
     "\n",
     "    @property\n",
     "    def test_data(self) -> TestData:\n",
-    "        _, test_template = split(\n",
-    "            self.gluonts_dataset, offset=-self.prediction_length * self.windows\n",
-    "        )\n",
     "        test_data = test_template.generate_instances(\n",
     "            prediction_length=self.prediction_length,\n",
     "            windows=self.windows,\n",
@@ -617,7 +579,7 @@
     "        ds_prediction_length: int,\n",
     "        ds_freq: str,\n",
     "        batch_size: int = 32,\n",
-    "        max_context_length: Optional[int] = None,\n",
     "        debug: bool = False,\n",
     "    ) -> None:\n",
     "        # Dataset-specific context (can be updated per dataset/term)\n",
@@ -633,9 +595,7 @@
     "        self.config = config\n",
     "\n",
     "        # Initialize scaler (using same type as model)\n",
-    "        scaler_type = self.config.get(\"TimeSeriesModel\", {}).get(\n",
-    "            \"scaler\", \"custom_robust\"\n",
-    "        )\n",
     "        epsilon = self.config.get(\"TimeSeriesModel\", {}).get(\"epsilon\", 1e-3)\n",
     "        if scaler_type == \"custom_robust\":\n",
     "            self.scaler = RobustScaler(epsilon=epsilon)\n",
@@ -644,10 +604,10 @@
     "\n",
     "    def set_dataset_context(\n",
     "        self,\n",
-    "        prediction_length: Optional[int] = None,\n",
-    "        freq: Optional[str] = None,\n",
-    "        batch_size: Optional[int] = None,\n",
-    "        max_context_length: Optional[int] = None,\n",
     "    ) -> None:\n",
     "        \"\"\"Update lightweight dataset-specific attributes without reloading the model.\"\"\"\n",
     "\n",
@@ -668,7 +628,7 @@
     "        ds_prediction_length: int,\n",
     "        ds_freq: str,\n",
     "        batch_size: int = 32,\n",
-    "        max_context_length: Optional[int] = None,\n",
     "        debug: bool = False,\n",
     "    ) -> \"TimeSeriesPredictor\":\n",
     "        return cls(\n",
@@ -689,10 +649,10 @@
     "        ds_prediction_length: int,\n",
     "        ds_freq: str,\n",
     "        batch_size: int = 32,\n",
-    "        max_context_length: Optional[int] = None,\n",
     "        debug: bool = False,\n",
     "    ) -> \"TimeSeriesPredictor\":\n",
-    "        with open(config_path, \"r\") as f:\n",
     "            config = yaml.safe_load(f)\n",
     "        model = cls._load_model_from_path(config=config, model_path=model_path)\n",
     "        return cls(\n",
@@ -738,13 +698,13 @@
     "                seq_len = min(seq_len, self.max_context_length)\n",
     "            return seq_len\n",
     "\n",
-    "        length_to_items: dict[int, List[tuple[int, object]]] = {}\n",
     "        for idx, entry in enumerate(test_data_input):\n",
     "            seq_len = _effective_length(entry)\n",
     "            length_to_items.setdefault(seq_len, []).append((idx, entry))\n",
     "\n",
     "        total = len(test_data_input)\n",
-    "        ordered_results: List[Optional[QuantileForecast]] = [None] * total\n",
     "\n",
     "        for _, items in length_to_items.items():\n",
     "            for i in range(0, len(items), self.batch_size):\n",
@@ -756,7 +716,7 @@
     "\n",
     "        return ordered_results  # type: ignore[return-value]\n",
     "\n",
-    "    def _predict_batch(self, test_data_batch: List) -> List[QuantileForecast]:\n",
     "        \"\"\"Generate predictions for a batch of time series.\"\"\"\n",
     "\n",
     "        logger.debug(f\"Processing batch of size: {len(test_data_batch)}\")\n",
@@ -778,9 +738,7 @@
     "                with torch.no_grad():\n",
     "                    model_output = self.model(batch_container, drop_enc_allow=False)\n",
     "\n",
-    "            forecasts = self._convert_to_forecasts(\n",
-    "                model_output, test_data_batch, batch_container\n",
-    "            )\n",
     "\n",
     "            logger.debug(f\"Generated {len(forecasts)} forecasts\")\n",
     "            return forecasts\n",
@@ -788,9 +746,7 @@
     "            logger.error(f\"Error in batch prediction: {exc}\")\n",
     "            raise\n",
     "\n",
-    "    def _convert_to_batch_container(\n",
-    "        self, test_data_batch: List\n",
-    "    ) -> BatchTimeSeriesContainer:\n",
     "        \"\"\"Convert gluonts test data to BatchTimeSeriesContainer.\"\"\"\n",
     "\n",
     "        batch_size = len(test_data_batch)\n",
@@ -806,10 +762,7 @@
     "            else:\n",
     "                target = target.T\n",
     "\n",
-    "            if (\n",
-    "                self.max_context_length is not None\n",
-    "                and len(target) > self.max_context_length\n",
-    "            ):\n",
     "                target = target[-self.max_context_length :]\n",
     "\n",
     "            history_values_list.append(target)\n",
@@ -819,9 +772,7 @@
     "        history_values_np = np.stack(history_values_list, axis=0)\n",
     "        num_channels = history_values_np.shape[2]\n",
     "\n",
-    "        history_values = torch.tensor(\n",
-    "            history_values_np, dtype=torch.float32, device=device\n",
-    "        )\n",
     "\n",
     "        future_values = torch.zeros(\n",
     "            (batch_size, self.ds_prediction_length, num_channels),\n",
@@ -839,28 +790,24 @@
     "    def _convert_to_forecasts(\n",
     "        self,\n",
     "        model_output: dict,\n",
-    "        test_data_batch: List,\n",
     "        batch_container: BatchTimeSeriesContainer,\n",
-    "    ) -> List[QuantileForecast]:\n",
     "        \"\"\"Convert model predictions to QuantileForecast objects.\"\"\"\n",
     "\n",
     "        predictions = model_output[\"result\"]\n",
     "        scale_statistics = model_output[\"scale_statistics\"]\n",
     "\n",
     "        if predictions.ndim == 4:\n",
-    "            predictions_unscaled = self.scaler.inverse_scale(\n",
-    "                predictions, scale_statistics\n",
-    "            )\n",
     "            is_quantile = True\n",
     "            quantile_levels = self.model.quantiles\n",
     "        else:\n",
-    "            predictions_unscaled = self.scaler.inverse_scale(\n",
-    "                predictions, scale_statistics\n",
-    "            )\n",
     "            is_quantile = False\n",
     "            quantile_levels = [0.5]\n",
     "\n",
-    "        forecasts: List[QuantileForecast] = []\n",
     "        for idx, entry in enumerate(test_data_batch):\n",
     "            history_length = int(batch_container.history_values.shape[1])\n",
     "            start_date = entry[\"start\"]\n",
@@ -931,7 +878,7 @@
     "\n",
     "\n",
     "def write_results_to_disk(\n",
-    "    items: List[EvaluationItem],\n",
     "    dataset_name: str,\n",
     "    output_dir: Path,\n",
     "    model_name: str,\n",
@@ -946,17 +893,13 @@
     "        writer = csv.writer(csvfile)\n",
     "        for item in items:\n",
     "            md: DatasetMetadata = item.dataset_metadata\n",
-    "            metric_values: List[Optional[float]] = []\n",
     "            for metric_name in STANDARD_METRIC_NAMES:\n",
     "                value = item.metrics.get(metric_name, None)\n",
     "                if value is None:\n",
     "                    metric_values.append(None)\n",
     "                else:\n",
-    "                    if (\n",
-    "                        hasattr(value, \"__len__\")\n",
-    "                        and not isinstance(value, (str, bytes))\n",
-    "                        and len(value) == 1\n",
-    "                    ):\n",
     "                        value = value[0]\n",
     "                    elif hasattr(value, \"item\"):\n",
     "                        value = value.item()\n",
@@ -965,9 +908,7 @@
     "            ds_key = md.key.lower()\n",
     "            props = DATASET_PROPERTIES.get(ds_key, {})\n",
     "            domain = props.get(\"domain\", \"unknown\")\n",
-    "            num_variates = props.get(\n",
-    "                \"num_variates\", 1 if md.to_univariate else md.target_dim\n",
-    "            )\n",
     "\n",
     "            row = [md.full_name, model_name] + metric_values + [domain, num_variates]\n",
     "            writer.writerow(row)\n",
@@ -989,11 +930,11 @@
     "        logger.info(\"Plots saved under %s\", output_dir / \"plots\")\n",
     "\n",
     "\n",
-    "def get_all_datasets_full_name() -> List[str]:\n",
     "    \"\"\"Get all possible dataset full names for validation.\"\"\"\n",
     "\n",
     "    terms = [\"short\", \"medium\", \"long\"]\n",
-    "    datasets_full_names: List[str] = []\n",
     "\n",
     "    for name in ALL_DATASETS:\n",
     "        for term in terms:\n",
@@ -1009,9 +950,7 @@
     "                ds_key = PRETTY_NAMES.get(ds_key, ds_key)\n",
     "                ds_freq = DATASET_PROPERTIES.get(ds_key, {}).get(\"frequency\")\n",
     "\n",
-    "            datasets_full_names.append(\n",
-    "                f\"{ds_key}/{ds_freq if ds_freq else 'unknown'}/{term}\"\n",
-    "            )\n",
     "\n",
     "    return datasets_full_names\n",
     "\n",
@@ -1029,7 +968,7 @@
     "        logger.error(\"No result files found!\")\n",
     "        return None\n",
     "\n",
-    "    dataframes: List[pd.DataFrame] = []\n",
     "    for file in result_files:\n",
     "        try:\n",
     "            df = pd.read_csv(file)\n",
@@ -1049,26 +988,18 @@
     "    combined_df = pd.concat(dataframes, ignore_index=True).sort_values(\"dataset\")\n",
     "\n",
     "    if len(combined_df) != len(set(combined_df.dataset)):\n",
-    "        duplicate_datasets = combined_df.dataset[\n",
-    "            combined_df.dataset.duplicated()\n",
-    "        ].tolist()\n",
     "        logger.warning(\"Warning: Duplicate datasets found: %s\", duplicate_datasets)\n",
     "        combined_df = combined_df.drop_duplicates(subset=[\"dataset\"], keep=\"first\")\n",
-    "        logger.info(\n",
-    "            \"Removed duplicates, %s unique datasets remaining\", len(combined_df)\n",
-    "        )\n",
     "\n",
     "    logger.info(\"Combined results: %s datasets\", len(combined_df))\n",
     "\n",
     "    all_datasets_full_name = get_all_datasets_full_name()\n",
     "    completed_experiments = combined_df.dataset.tolist()\n",
     "\n",
-    "    completed_experiments_clean = [\n",
-    "        exp for exp in completed_experiments if exp in all_datasets_full_name\n",
-    "    ]\n",
-    "    missing_or_failed_experiments = [\n",
-    "        exp for exp in all_datasets_full_name if exp not in completed_experiments_clean\n",
-    "    ]\n",
     "\n",
     "    logger.info(\"=== EXPERIMENT SUMMARY ===\")\n",
     "    logger.info(\"Total expected datasets: %s\", len(all_datasets_full_name))\n",
@@ -1102,11 +1033,15 @@
     "def construct_evaluation_data(\n",
     "    dataset_name: str,\n",
     "    dataset_storage_path: str,\n",
-    "    terms: List[str] = [\"short\", \"medium\", \"long\"],\n",
-    "    max_windows: Optional[int] = None,\n",
-    ") -> List[Tuple[Dataset, DatasetMetadata]]:\n",
     "    \"\"\"Build datasets and rich metadata per term for a dataset name.\"\"\"\n",
-    "    sub_datasets: List[Tuple[Dataset, DatasetMetadata]] = []\n",
     "\n",
     "    if \"/\" in dataset_name:\n",
     "        ds_key, ds_freq = dataset_name.split(\"/\")\n",
@@ -1119,9 +1054,7 @@
     "\n",
     "    for term in terms:\n",
     "        # Skip medium/long terms for datasets that don't support them\n",
-    "        if (\n",
-    "            term == \"medium\" or term == \"long\"\n",
-    "        ) and dataset_name not in MED_LONG_DATASETS:\n",
     "            continue\n",
     "\n",
     "        # Probe once to determine dimensionality\n",
@@ -1146,7 +1079,7 @@
     "        # Compute metadata\n",
     "        season_length = get_seasonality(dataset.freq)\n",
     "        actual_freq = ds_freq if ds_freq else dataset.freq\n",
-    "        \n",
     "        metadata = DatasetMetadata(\n",
     "            full_name=f\"{ds_key}/{actual_freq}/{term}\",\n",
     "            key=ds_key,\n",
@@ -1168,14 +1101,18 @@
     "    predictor: TimeSeriesPredictor,\n",
     "    dataset: str,\n",
     "    dataset_storage_path: str,\n",
-    "    terms: List[str] = [\"short\", \"medium\", \"long\"],\n",
-    "    max_windows: Optional[int] = None,\n",
     "    batch_size: int = 48,\n",
-    "    max_context_length: Optional[int] = 1024,\n",
     "    create_plots: bool = False,\n",
     "    max_plots_per_dataset: int = 10,\n",
-    ") -> List[EvaluationItem]:\n",
     "    \"\"\"Evaluate predictor on one dataset across the requested terms.\"\"\"\n",
     "    sub_datasets = construct_evaluation_data(\n",
     "        dataset_name=dataset,\n",
     "        dataset_storage_path=dataset_storage_path,\n",
@@ -1183,7 +1120,7 @@
     "        max_windows=max_windows,\n",
     "    )\n",
     "\n",
-    "    results: List[EvaluationItem] = []\n",
     "    for i, (sub_dataset, metadata) in enumerate(sub_datasets):\n",
     "        logger.info(f\"Evaluating {i + 1}/{len(sub_datasets)}: {metadata.full_name}\")\n",
     "        logger.info(f\"  Dataset size: {len(sub_dataset.test_data)}\")\n",
@@ -1211,16 +1148,16 @@
     "            seasonality=metadata.season_length,\n",
     "        )\n",
     "\n",
-    "        figs: List[Tuple[object, str]] = []\n",
     "        if create_plots:\n",
     "            # We are missing `src.plotting.gift_eval_utils.create_plots_for_dataset`\n",
     "            # As this was not provided, plotting will be skipped.\n",
-    "            logger.warning(\"Plotting is enabled but `create_plots_for_dataset` is not defined. Skipping plot generation.\")\n",
     "            pass\n",
     "\n",
-    "        results.append(\n",
-    "            EvaluationItem(dataset_metadata=metadata, metrics=res, figures=figs)\n",
-    "        )\n",
     "\n",
     "    return results"
    ]
@@ -1232,7 +1169,7 @@
    "source": [
     "## 4. Configuration\n",
     "\n",
-    "Set the parameters for the evaluation run. Update `config_path` and `checkpoint_url` to point to your model's files."
    ]
   },
   {
@@ -1243,64 +1180,28 @@
    "outputs": [],
    "source": [
     "# --- Parameters ---\n",
-    "model_path = None  # e.g., \"/path/to/checkpoint.pth\"; if None, try checkpoint_url\n",
-    "config_path = Path.cwd().parent.parent / \"configs/example.yaml\" \n",
-    "checkpoint_url = \"https://www.dropbox.com/scl/fi/mqsni5lehooyaw93y3uzq/checkpoint_38M.pth?rlkey=3uyehvmtted02xkha24zgpzb6&st=seevsbkn&dl=0\"   \n",
     "\n",
     "# --- Datasets and evaluation controls ---\n",
     "# Use a small subset for testing, e.g., [\"m4_weekly\"]\n",
-    "datasets_arg = [\"all\"] # list of dataset names or [\"all\"]. \n",
     "terms = [\"short\", \"medium\", \"long\"]\n",
     "dataset_storage_path = os.getenv(\"GIFT_EVAL_DATASET_STORAGE_PATH\")\n",
     "max_windows = 20\n",
     "batch_size = 64\n",
-    "max_context_length = 3072   \n",
     "\n",
     "# --- Output ---\n",
     "after_each_dataset_flush = True  # write CSV as each dataset completes\n",
     "model_name = \"TempoPFN\"\n",
-    "download_dir = Path.cwd().parent / \"models\"\n",
-    "output_dir = Path.cwd().parent / \"gift_eval_results\" / model_name\n",
     "\n",
-    "# --- Helper Functions ---\n",
-    "\n",
-    "def download_checkpoint_if_needed(url: str, target_dir: Path, target_filename: str = \"checkpoint.pth\") -> Path:\n",
-    "    \"\"\"Downloads a file from a URL if it doesn't exist.\"\"\"\n",
-    "    try:\n",
-    "        import requests\n",
-    "    except ImportError:\n",
-    "        logger.error(\"requests package not found. Please install it: pip install requests\")\n",
-    "        raise\n",
-    "        \n",
-    "    target_dir.mkdir(parents=True, exist_ok=True)\n",
-    "    target_file_path = target_dir / target_filename\n",
-    "    \n",
-    "    if target_file_path.exists():\n",
-    "        logger.info(f\"Checkpoint already exists: {target_file_path}\")\n",
-    "        return target_file_path\n",
-    "    \n",
-    "    logger.info(f\"Downloading checkpoint from {url} to {target_file_path}...\")\n",
-    "    \n",
-    "    # Handle Dropbox links\n",
-    "    if \"dropbox.com\" in url:\n",
-    "        url = url.replace(\"dl=0\", \"dl=1\").replace(\"st=\", \"dl=1&st=\")\n",
-    "        \n",
-    "    try:\n",
-    "        with requests.get(url, stream=True) as r:\n",
-    "            r.raise_for_status()\n",
-    "            with open(target_file_path, 'wb') as f:\n",
-    "                for chunk in r.iter_content(chunk_size=8192):\n",
-    "                    f.write(chunk)\n",
-    "        logger.info(\"Download complete.\")\n",
-    "        return target_file_path\n",
-    "    except Exception as e:\n",
-    "        logger.error(f\"Failed to download checkpoint: {e}\")\n",
-    "        if target_file_path.exists():\n",
-    "            os.remove(target_file_path) # Clean up partial download\n",
-    "        raise\n",
     "\n",
     "def _load_yaml(path: str) -> dict:\n",
-    "    with open(path, \"r\") as f:\n",
     "        return yaml.safe_load(f)"
    ]
   },
@@ -1324,27 +1225,19 @@
     "logger.info(\"Starting evaluation for model: %s\", model_name)\n",
     "\n",
     "# 1. Build predictor from a checkpoint\n",
-    "resolved_model_path = None\n",
-    "if model_path:\n",
-    "    resolved_model_path = model_path\n",
-    "elif checkpoint_url:\n",
-    "    resolved_model_path = download_checkpoint_if_needed(\n",
-    "        checkpoint_url, \n",
-    "        target_dir=download_dir,\n",
-    "        target_filename=f\"{model_name}_checkpoint.pth\"\n",
-    "    )\n",
     "\n",
-    "if not resolved_model_path or not Path(resolved_model_path).exists():\n",
-    "    raise FileNotFoundError(\n",
-    "        f\"No model checkpoint found. Set `model_path` or `checkpoint_url`. Tried: {resolved_model_path}\"\n",
-    "    )\n",
     "\n",
     "assert Path(config_path).exists(), f\"Config not found: {config_path}\"\n",
     "logger.info(\"Loading predictor from checkpoint: %s\", resolved_model_path)\n",
     "\n",
     "predictor = TimeSeriesPredictor.from_paths(\n",
-    "    model_path=resolved_model_path,\n",
-    "    config_path=config_path,\n",
     "    ds_prediction_length=1,  # placeholder; set per dataset\n",
     "    ds_freq=\"D\",  # placeholder; set per dataset\n",
     "    batch_size=batch_size,\n",
@@ -1380,7 +1273,7 @@
     "    except Exception as e:\n",
     "        logger.error(f\"FAILED evaluation for dataset: {ds_name}. Error: {e} !!!\")\n",
     "        logger.exception(e)\n",
-    "        continue # Continue to the next dataset\n",
     "\n",
     "print(f\"\\nEvaluation complete. See results under: {output_dir}\")"
    ]

    "metadata": {},
    "outputs": [],
    "source": [
+    "import csv\n",
+    "import glob\n",
     "import json\n",
     "import logging\n",
     "import math\n",
+    "import os\n",
     "import warnings\n",
+    "from collections.abc import Iterable, Iterator\n",
     "from dataclasses import dataclass\n",
+    "from enum import Enum\n",
+    "from functools import cached_property\n",
+    "from pathlib import Path\n",
     "\n",
     "# GluonTS and Data Handling\n",
     "import datasets\n",
+    "\n",
+    "# Plotting and Warnings\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
     "import pyarrow.compute as pc\n",
+    "import torch\n",
+    "import yaml\n",
+    "from dotenv import load_dotenv\n",
     "from gluonts.dataset import DataEntry\n",
     "from gluonts.dataset.common import ProcessDataEntry\n",
     "from gluonts.dataset.split import TestData, TrainingDataset, split\n",
     "\n",
     "# GluonTS Evaluation\n",
     "from gluonts.ev.metrics import (\n",
     "    SMAPE,\n",
     "    MeanWeightedSumQuantileLoss,\n",
     ")\n",
+    "from gluonts.itertools import Map\n",
     "from gluonts.model.evaluation import evaluate_model\n",
     "from gluonts.model.forecast import QuantileForecast\n",
     "from gluonts.model.predictor import Predictor\n",
+    "from gluonts.time_feature import get_seasonality, norm_freq_str\n",
+    "from gluonts.transform import Transformation\n",
     "from linear_operator.utils.cholesky import NumericalWarning\n",
+    "from pandas.tseries.frequencies import to_offset\n",
     "\n",
     "# --- TempoPFN Core Model Imports ---\n",
     "# These are assumed to be installed or in the PYTHONPATH\n",
     "from src.data.scalers import RobustScaler\n",
     "from src.models.model import TimeSeriesModel\n",
     "from src.utils.utils import device\n",
+    "from toolz import compose\n",
+    "from torch.nn.parallel import DistributedDataParallel as DDP\n",
     "\n",
     "# --- Setup Logging ---\n",
     "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s - %(levelname)s - %(message)s\")\n",
     "logging.getLogger(\"PIL\").setLevel(logging.WARNING)\n",
     "logger = logging.getLogger(\"gift_eval_runner\")\n",
     "\n",
+    "\n",
     "# Filter out specific gluonts warnings\n",
     "class WarningFilter(logging.Filter):\n",
     "    def __init__(self, text_to_filter: str) -> None:\n",
     "    def filter(self, record: logging.LogRecord) -> bool:\n",
     "        return self.text_to_filter not in record.getMessage()\n",
     "\n",
+    "\n",
     "gts_logger = logging.getLogger(\"gluonts.model.forecast\")\n",
+    "gts_logger.addFilter(WarningFilter(\"The mean prediction is not stored in the forecast data\"))\n",
     "\n",
     "# Filter out numerical warnings\n",
     "warnings.filterwarnings(\"ignore\", category=NumericalWarning)\n",
     "DATASET_PROPERTIES_PATH = _MODULE_DIR / \"data\" / \"dataset_properties.json\"\n",
     "\n",
     "try:\n",
+    "    with open(DATASET_PROPERTIES_PATH) as f:\n",
     "        DATASET_PROPERTIES = json.load(f)\n",
     "except Exception as exc:  # pragma: no cover - logging path\n",
     "    DATASET_PROPERTIES = {}\n",
     "    RMSE(),\n",
     "    NRMSE(),\n",
     "    ND(),\n",
+    "    MeanWeightedSumQuantileLoss(quantile_levels=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),\n",
     ")\n",
     "\n",
     "# Standard metric names for CSV header\n",
     "    \"\"\"Container for evaluation results and optional figures.\"\"\"\n",
     "\n",
     "    dataset_metadata: DatasetMetadata\n",
+    "    metrics: dict\n",
+    "    figures: list[tuple[object, str]]\n",
     "\n",
     "\n",
+    "DatasetSelection = list[str] | tuple[str, ...] | str\n",
     "\n",
     "\n",
+    "def expand_datasets_arg(datasets: DatasetSelection) -> list[str]:\n",
     "    \"\"\"Normalize dataset selection strings to explicit lists.\"\"\"\n",
     "\n",
     "    if isinstance(datasets, str):\n",
     "    def __init__(self, field):\n",
     "        self.field = field\n",
     "\n",
+    "    def __call__(self, data_it: Iterable[DataEntry], is_train: bool = False) -> Iterator:\n",
     "        for data_entry in data_it:\n",
     "            item_id = data_entry[\"item_id\"]\n",
     "            val_ls = list(data_entry[self.field])\n",
     "        term: Term | str = Term.SHORT,\n",
     "        to_univariate: bool = False,\n",
     "        storage_path: str = None,\n",
+    "        max_windows: int | None = None,\n",
     "    ):\n",
     "        storage_path = Path(storage_path)\n",
+    "        self.hf_dataset = datasets.load_from_disk(str(storage_path / name)).with_format(\"numpy\")\n",
     "        process = ProcessDataEntry(\n",
     "            self.freq,\n",
     "            one_dim_target=self.target_dim == 1,\n",
     "\n",
     "        self.gluonts_dataset = Map(compose(process, itemize_start), self.hf_dataset)\n",
     "        if to_univariate:\n",
+    "            self.gluonts_dataset = MultivariateToUnivariate(\"target\").apply(self.gluonts_dataset)\n",
     "\n",
     "        self.term = Term(term)\n",
     "        self.name = name\n",
     "        freq = norm_freq_str(to_offset(self.freq).name)\n",
     "        if freq.endswith(\"E\"):\n",
     "            freq = freq[:-1]\n",
+    "        pred_len = M4_PRED_LENGTH_MAP[freq] if \"m4\" in self.name else PRED_LENGTH_MAP[freq]\n",
     "        return self.term.multiplier * pred_len\n",
     "\n",
     "    @cached_property\n",
     "\n",
     "    @cached_property\n",
     "    def target_dim(self) -> int:\n",
+    "        return target.shape[0] if len((target := self.hf_dataset[0][\"target\"]).shape) > 1 else 1\n",
     "\n",
     "    @cached_property\n",
     "    def past_feat_dynamic_real_dim(self) -> int:\n",
     "        if \"past_feat_dynamic_real\" not in self.hf_dataset[0]:\n",
     "            return 0\n",
+    "        elif len((past_feat_dynamic_real := self.hf_dataset[0][\"past_feat_dynamic_real\"]).shape) > 1:\n",
     "            return past_feat_dynamic_real.shape[0]\n",
     "        else:\n",
     "            return 1\n",
     "    @cached_property\n",
     "    def _min_series_length(self) -> int:\n",
     "        if self.hf_dataset[0][\"target\"].ndim > 1:\n",
+    "            lengths = pc.list_value_length(pc.list_flatten(pc.list_slice(self.hf_dataset.data.column(\"target\"), 0, 1)))\n",
     "        else:\n",
     "            lengths = pc.list_value_length(self.hf_dataset.data.column(\"target\"))\n",
     "        return min(lengths.to_numpy())\n",
     "    @cached_property\n",
     "    def sum_series_length(self) -> int:\n",
     "        if self.hf_dataset[0][\"target\"].ndim > 1:\n",
+    "            lengths = pc.list_value_length(pc.list_flatten(self.hf_dataset.data.column(\"target\")))\n",
     "        else:\n",
     "            lengths = pc.list_value_length(self.hf_dataset.data.column(\"target\"))\n",
     "        return sum(lengths.to_numpy())\n",
     "\n",
     "    @property\n",
     "    def training_dataset(self) -> TrainingDataset:\n",
+    "        training_dataset, _ = split(self.gluonts_dataset, offset=-self.prediction_length * (self.windows + 1))\n",
     "        return training_dataset\n",
     "\n",
     "    @property\n",
     "    def validation_dataset(self) -> TrainingDataset:\n",
+    "        validation_dataset, _ = split(self.gluonts_dataset, offset=-self.prediction_length * self.windows)\n",
     "        return validation_dataset\n",
     "\n",
     "    @property\n",
     "    def test_data(self) -> TestData:\n",
+    "        _, test_template = split(self.gluonts_dataset, offset=-self.prediction_length * self.windows)\n",
     "        test_data = test_template.generate_instances(\n",
     "            prediction_length=self.prediction_length,\n",
     "            windows=self.windows,\n",
     "        ds_prediction_length: int,\n",
     "        ds_freq: str,\n",
     "        batch_size: int = 32,\n",
+    "        max_context_length: int | None = None,\n",
     "        debug: bool = False,\n",
     "    ) -> None:\n",
     "        # Dataset-specific context (can be updated per dataset/term)\n",
     "        self.config = config\n",
     "\n",
     "        # Initialize scaler (using same type as model)\n",
+    "        scaler_type = self.config.get(\"TimeSeriesModel\", {}).get(\"scaler\", \"custom_robust\")\n",
     "        epsilon = self.config.get(\"TimeSeriesModel\", {}).get(\"epsilon\", 1e-3)\n",
     "        if scaler_type == \"custom_robust\":\n",
     "            self.scaler = RobustScaler(epsilon=epsilon)\n",
     "\n",
     "    def set_dataset_context(\n",
     "        self,\n",
+    "        prediction_length: int | None = None,\n",
+    "        freq: str | None = None,\n",
+    "        batch_size: int | None = None,\n",
+    "        max_context_length: int | None = None,\n",
     "    ) -> None:\n",
     "        \"\"\"Update lightweight dataset-specific attributes without reloading the model.\"\"\"\n",
     "\n",
     "        ds_prediction_length: int,\n",
     "        ds_freq: str,\n",
     "        batch_size: int = 32,\n",
+    "        max_context_length: int | None = None,\n",
     "        debug: bool = False,\n",
     "    ) -> \"TimeSeriesPredictor\":\n",
     "        return cls(\n",
     "        ds_prediction_length: int,\n",
     "        ds_freq: str,\n",
     "        batch_size: int = 32,\n",
+    "        max_context_length: int | None = None,\n",
     "        debug: bool = False,\n",
     "    ) -> \"TimeSeriesPredictor\":\n",
+    "        with open(config_path) as f:\n",
     "            config = yaml.safe_load(f)\n",
     "        model = cls._load_model_from_path(config=config, model_path=model_path)\n",
     "        return cls(\n",
     "                seq_len = min(seq_len, self.max_context_length)\n",
     "            return seq_len\n",
     "\n",
+    "        length_to_items: dict[int, list[tuple[int, object]]] = {}\n",
     "        for idx, entry in enumerate(test_data_input):\n",
     "            seq_len = _effective_length(entry)\n",
     "            length_to_items.setdefault(seq_len, []).append((idx, entry))\n",
     "\n",
     "        total = len(test_data_input)\n",
+    "        ordered_results: list[QuantileForecast | None] = [None] * total\n",
     "\n",
     "        for _, items in length_to_items.items():\n",
     "            for i in range(0, len(items), self.batch_size):\n",
     "\n",
     "        return ordered_results  # type: ignore[return-value]\n",
     "\n",
+    "    def _predict_batch(self, test_data_batch: list) -> list[QuantileForecast]:\n",
     "        \"\"\"Generate predictions for a batch of time series.\"\"\"\n",
     "\n",
     "        logger.debug(f\"Processing batch of size: {len(test_data_batch)}\")\n",
     "                with torch.no_grad():\n",
     "                    model_output = self.model(batch_container, drop_enc_allow=False)\n",
     "\n",
+    "            forecasts = self._convert_to_forecasts(model_output, test_data_batch, batch_container)\n",
     "\n",
     "            logger.debug(f\"Generated {len(forecasts)} forecasts\")\n",
     "            return forecasts\n",
     "            logger.error(f\"Error in batch prediction: {exc}\")\n",
     "            raise\n",
     "\n",
+    "    def _convert_to_batch_container(self, test_data_batch: list) -> BatchTimeSeriesContainer:\n",
     "        \"\"\"Convert gluonts test data to BatchTimeSeriesContainer.\"\"\"\n",
     "\n",
     "        batch_size = len(test_data_batch)\n",
     "            else:\n",
     "                target = target.T\n",
     "\n",
+    "            if self.max_context_length is not None and len(target) > self.max_context_length:\n",
     "                target = target[-self.max_context_length :]\n",
     "\n",
     "            history_values_list.append(target)\n",
     "        history_values_np = np.stack(history_values_list, axis=0)\n",
     "        num_channels = history_values_np.shape[2]\n",
     "\n",
+    "        history_values = torch.tensor(history_values_np, dtype=torch.float32, device=device)\n",
     "\n",
     "        future_values = torch.zeros(\n",
     "            (batch_size, self.ds_prediction_length, num_channels),\n",
     "    def _convert_to_forecasts(\n",
     "        self,\n",
     "        model_output: dict,\n",
+    "        test_data_batch: list,\n",
     "        batch_container: BatchTimeSeriesContainer,\n",
+    "    ) -> list[QuantileForecast]:\n",
     "        \"\"\"Convert model predictions to QuantileForecast objects.\"\"\"\n",
     "\n",
     "        predictions = model_output[\"result\"]\n",
     "        scale_statistics = model_output[\"scale_statistics\"]\n",
     "\n",
     "        if predictions.ndim == 4:\n",
+    "            predictions_unscaled = self.scaler.inverse_scale(predictions, scale_statistics)\n",
     "            is_quantile = True\n",
     "            quantile_levels = self.model.quantiles\n",
     "        else:\n",
+    "            predictions_unscaled = self.scaler.inverse_scale(predictions, scale_statistics)\n",
     "            is_quantile = False\n",
     "            quantile_levels = [0.5]\n",
     "\n",
+    "        forecasts: list[QuantileForecast] = []\n",
     "        for idx, entry in enumerate(test_data_batch):\n",
     "            history_length = int(batch_container.history_values.shape[1])\n",
     "            start_date = entry[\"start\"]\n",
     "\n",
     "\n",
     "def write_results_to_disk(\n",
+    "    items: list[EvaluationItem],\n",
     "    dataset_name: str,\n",
     "    output_dir: Path,\n",
     "    model_name: str,\n",
     "        writer = csv.writer(csvfile)\n",
     "        for item in items:\n",
     "            md: DatasetMetadata = item.dataset_metadata\n",
+    "            metric_values: list[float | None] = []\n",
     "            for metric_name in STANDARD_METRIC_NAMES:\n",
     "                value = item.metrics.get(metric_name, None)\n",
     "                if value is None:\n",
     "                    metric_values.append(None)\n",
     "                else:\n",
+    "                    if hasattr(value, \"__len__\") and not isinstance(value, (str, bytes)) and len(value) == 1:\n",
     "                        value = value[0]\n",
     "                    elif hasattr(value, \"item\"):\n",
     "                        value = value.item()\n",
     "            ds_key = md.key.lower()\n",
     "            props = DATASET_PROPERTIES.get(ds_key, {})\n",
     "            domain = props.get(\"domain\", \"unknown\")\n",
+    "            num_variates = props.get(\"num_variates\", 1 if md.to_univariate else md.target_dim)\n",
     "\n",
     "            row = [md.full_name, model_name] + metric_values + [domain, num_variates]\n",
     "            writer.writerow(row)\n",
     "        logger.info(\"Plots saved under %s\", output_dir / \"plots\")\n",
     "\n",
     "\n",
+    "def get_all_datasets_full_name() -> list[str]:\n",
     "    \"\"\"Get all possible dataset full names for validation.\"\"\"\n",
     "\n",
     "    terms = [\"short\", \"medium\", \"long\"]\n",
+    "    datasets_full_names: list[str] = []\n",
     "\n",
     "    for name in ALL_DATASETS:\n",
     "        for term in terms:\n",
     "                ds_key = PRETTY_NAMES.get(ds_key, ds_key)\n",
     "                ds_freq = DATASET_PROPERTIES.get(ds_key, {}).get(\"frequency\")\n",
     "\n",
+    "            datasets_full_names.append(f\"{ds_key}/{ds_freq if ds_freq else 'unknown'}/{term}\")\n",
     "\n",
     "    return datasets_full_names\n",
     "\n",
     "        logger.error(\"No result files found!\")\n",
     "        return None\n",
     "\n",
+    "    dataframes: list[pd.DataFrame] = []\n",
     "    for file in result_files:\n",
     "        try:\n",
     "            df = pd.read_csv(file)\n",
     "    combined_df = pd.concat(dataframes, ignore_index=True).sort_values(\"dataset\")\n",
     "\n",
     "    if len(combined_df) != len(set(combined_df.dataset)):\n",
+    "        duplicate_datasets = combined_df.dataset[combined_df.dataset.duplicated()].tolist()\n",
     "        logger.warning(\"Warning: Duplicate datasets found: %s\", duplicate_datasets)\n",
     "        combined_df = combined_df.drop_duplicates(subset=[\"dataset\"], keep=\"first\")\n",
+    "        logger.info(\"Removed duplicates, %s unique datasets remaining\", len(combined_df))\n",
     "\n",
     "    logger.info(\"Combined results: %s datasets\", len(combined_df))\n",
     "\n",
     "    all_datasets_full_name = get_all_datasets_full_name()\n",
     "    completed_experiments = combined_df.dataset.tolist()\n",
     "\n",
+    "    completed_experiments_clean = [exp for exp in completed_experiments if exp in all_datasets_full_name]\n",
+    "    missing_or_failed_experiments = [exp for exp in all_datasets_full_name if exp not in completed_experiments_clean]\n",
     "\n",
     "    logger.info(\"=== EXPERIMENT SUMMARY ===\")\n",
     "    logger.info(\"Total expected datasets: %s\", len(all_datasets_full_name))\n",
     "def construct_evaluation_data(\n",
     "    dataset_name: str,\n",
     "    dataset_storage_path: str,\n",
+    "    terms: list[str] | None = None,\n",
+    "    max_windows: int | None = None,\n",
+    ") -> list[tuple[Dataset, DatasetMetadata]]:\n",
     "    \"\"\"Build datasets and rich metadata per term for a dataset name.\"\"\"\n",
+    "    # Avoid mutable default argument\n",
+    "    if terms is None:\n",
+    "        terms = [\"short\", \"medium\", \"long\"]\n",
+    "\n",
+    "    sub_datasets: list[tuple[Dataset, DatasetMetadata]] = []\n",
     "\n",
     "    if \"/\" in dataset_name:\n",
     "        ds_key, ds_freq = dataset_name.split(\"/\")\n",
     "\n",
     "    for term in terms:\n",
     "        # Skip medium/long terms for datasets that don't support them\n",
+    "        if (term == \"medium\" or term == \"long\") and dataset_name not in MED_LONG_DATASETS:\n",
     "            continue\n",
     "\n",
     "        # Probe once to determine dimensionality\n",
     "        # Compute metadata\n",
     "        season_length = get_seasonality(dataset.freq)\n",
     "        actual_freq = ds_freq if ds_freq else dataset.freq\n",
+    "\n",
     "        metadata = DatasetMetadata(\n",
     "            full_name=f\"{ds_key}/{actual_freq}/{term}\",\n",
     "            key=ds_key,\n",
     "    predictor: TimeSeriesPredictor,\n",
     "    dataset: str,\n",
     "    dataset_storage_path: str,\n",
+    "    terms: list[str] | None = None,\n",
+    "    max_windows: int | None = None,\n",
     "    batch_size: int = 48,\n",
+    "    max_context_length: int | None = 1024,\n",
     "    create_plots: bool = False,\n",
     "    max_plots_per_dataset: int = 10,\n",
+    ") -> list[EvaluationItem]:\n",
     "    \"\"\"Evaluate predictor on one dataset across the requested terms.\"\"\"\n",
+    "    # Avoid mutable default argument\n",
+    "    if terms is None:\n",
+    "        terms = [\"short\", \"medium\", \"long\"]\n",
+    "\n",
     "    sub_datasets = construct_evaluation_data(\n",
     "        dataset_name=dataset,\n",
     "        dataset_storage_path=dataset_storage_path,\n",
     "        max_windows=max_windows,\n",
     "    )\n",
     "\n",
+    "    results: list[EvaluationItem] = []\n",
     "    for i, (sub_dataset, metadata) in enumerate(sub_datasets):\n",
     "        logger.info(f\"Evaluating {i + 1}/{len(sub_datasets)}: {metadata.full_name}\")\n",
     "        logger.info(f\"  Dataset size: {len(sub_dataset.test_data)}\")\n",
     "            seasonality=metadata.season_length,\n",
     "        )\n",
     "\n",
+    "        figs: list[tuple[object, str]] = []\n",
     "        if create_plots:\n",
     "            # We are missing `src.plotting.gift_eval_utils.create_plots_for_dataset`\n",
     "            # As this was not provided, plotting will be skipped.\n",
+    "            logger.warning(\n",
+    "                \"Plotting is enabled but `create_plots_for_dataset` is not defined. Skipping plot generation.\"\n",
+    "            )\n",
     "            pass\n",
     "\n",
+    "        results.append(EvaluationItem(dataset_metadata=metadata, metrics=res, figures=figs))\n",
     "\n",
     "    return results"
    ]
    "source": [
     "## 4. Configuration\n",
     "\n",
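+    "Set the parameters for the evaluation run. By default, this notebook loads the model checkpoint from the local `models/` directory and the config from `configs/`, both resolved relative to the current working directory (the repo root)."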
    ]
   },
   {
    "outputs": [],
    "source": [
     "# --- Parameters ---\n",
+    "# Assumes the notebook is run from the root of the repo\n",
+    "model_path = Path.cwd() / \"models/checkpoint_38M.pth\"\n",
+    "config_path = Path.cwd() / \"configs/example.yaml\"\n",
     "\n",
     "# --- Datasets and evaluation controls ---\n",
     "# Use a small subset for testing, e.g., [\"m4_weekly\"]\n",
+    "datasets_arg = [\"all\"]  # list of dataset names or [\"all\"].\n",
     "terms = [\"short\", \"medium\", \"long\"]\n",
     "dataset_storage_path = os.getenv(\"GIFT_EVAL_DATASET_STORAGE_PATH\")\n",
     "max_windows = 20\n",
     "batch_size = 64\n",
+    "max_context_length = 3072\n",
     "\n",
     "# --- Output ---\n",
     "after_each_dataset_flush = True  # write CSV as each dataset completes\n",
     "model_name = \"TempoPFN\"\n",
+    "output_dir = Path.cwd() / \"gift_eval_results\" / model_name\n",
     "\n",
     "\n",
+    "# --- Helper Functions ---\n",
     "def _load_yaml(path: str) -> dict:\n",
+    "    with open(path) as f:\n",
     "        return yaml.safe_load(f)"
    ]
   },
     "logger.info(\"Starting evaluation for model: %s\", model_name)\n",
     "\n",
     "# 1. Build predictor from a checkpoint\n",
+    "resolved_model_path = Path(model_path)\n",
     "\n",
+    "if not resolved_model_path.exists():\n",
+    "    logger.error(f\"Model checkpoint not found at: {resolved_model_path}\")\n",
+    "    logger.error(\"Please ensure the file exists and you've cloned the repo using Git LFS.\")\n",
+    "    raise FileNotFoundError(f\"No model checkpoint found. Set `model_path` correctly. Tried: {resolved_model_path}\")\n",
     "\n",
     "assert Path(config_path).exists(), f\"Config not found: {config_path}\"\n",
     "logger.info(\"Loading predictor from checkpoint: %s\", resolved_model_path)\n",
     "\n",
     "predictor = TimeSeriesPredictor.from_paths(\n",
+    "    model_path=str(resolved_model_path),\n",
+    "    config_path=str(config_path),\n",
     "    ds_prediction_length=1,  # placeholder; set per dataset\n",
     "    ds_freq=\"D\",  # placeholder; set per dataset\n",
     "    batch_size=batch_size,\n",
     "    except Exception as e:\n",
     "        logger.error(f\"FAILED evaluation for dataset: {ds_name}. Error: {e} !!!\")\n",
     "        logger.exception(e)\n",
+    "        continue  # Continue to the next dataset\n",
     "\n",
     "print(f\"\\nEvaluation complete. See results under: {output_dir}\")"
    ]

examples/quick_start_tempo_pfn.ipynb CHANGED Viewed

@@ -30,11 +30,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import urllib.request\n",
-    "import torch\n",
-    "import numpy as np\n",
     "from pathlib import Path\n",
     "\n",
     "# Ensure CUDA is available\n",
     "if not torch.cuda.is_available():\n",
     "    raise RuntimeError(\"CUDA is required to run this demo. No CUDA device detected.\")\n",
@@ -47,7 +47,7 @@
     "    repo_root = repo_root.parent\n",
     "\n",
     "# Inline plotting\n",
-    "%matplotlib inline\n"
    ]
   },
   {
@@ -66,11 +66,11 @@
    "outputs": [],
    "source": [
     "CHECKPOINT_DIR = repo_root / \"models\"\n",
-    "CHECKPOINT_NAME = \"checkpoint_38M.pth\" \n",
     "CHECKPOINT_PATH = CHECKPOINT_DIR / CHECKPOINT_NAME\n",
     "\n",
     "# Ensure the models directory exists\n",
-    "CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True) \n",
     "\n",
     "if not CHECKPOINT_PATH.exists():\n",
     "    print(f\"--- WARNING: Checkpoint not found at: {CHECKPOINT_PATH} ---\")\n",
@@ -165,7 +165,7 @@
     "import yaml\n",
     "from src.models.model import TimeSeriesModel\n",
     "\n",
-    "with open(repo_root / \"configs/example.yaml\", \"r\") as f:\n",
     "    config = yaml.safe_load(f)\n",
     "\n",
     "model = TimeSeriesModel(**config[\"TimeSeriesModel\"]).to(device)\n",

    "metadata": {},
    "outputs": [],
    "source": [
     "from pathlib import Path\n",
     "\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "\n",
     "# Ensure CUDA is available\n",
     "if not torch.cuda.is_available():\n",
     "    raise RuntimeError(\"CUDA is required to run this demo. No CUDA device detected.\")\n",
     "    repo_root = repo_root.parent\n",
     "\n",
     "# Inline plotting\n",
+    "%matplotlib inline"
    ]
   },
   {
    "outputs": [],
    "source": [
     "CHECKPOINT_DIR = repo_root / \"models\"\n",
+    "CHECKPOINT_NAME = \"checkpoint_38M.pth\"\n",
     "CHECKPOINT_PATH = CHECKPOINT_DIR / CHECKPOINT_NAME\n",
     "\n",
     "# Ensure the models directory exists\n",
+    "CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)\n",
     "\n",
     "if not CHECKPOINT_PATH.exists():\n",
     "    print(f\"--- WARNING: Checkpoint not found at: {CHECKPOINT_PATH} ---\")\n",
     "import yaml\n",
     "from src.models.model import TimeSeriesModel\n",
     "\n",
+    "with open(repo_root / \"configs/example.yaml\") as f:\n",
     "    config = yaml.safe_load(f)\n",
     "\n",
     "model = TimeSeriesModel(**config[\"TimeSeriesModel\"]).to(device)\n",

examples/quick_start_tempo_pfn.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import argparse
 import logging
-import os
 import torch
 from examples.utils import (
     load_model,
     run_inference_and_plot,
@@ -15,9 +14,7 @@ from src.synthetic_generation.sine_waves.sine_wave_generator_wrapper import (
 )
 # Configure logging
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
 logger = logging.getLogger(__name__)
@@ -32,7 +29,7 @@ def main():
     )
     parser.add_argument(
         "--checkpoint",
-        default="models/checkpoint_38M.pth",
         help="Path to model checkpoint file (default: models/checkpoint_38M.pth)",
     )
     parser.add_argument("--batch_size", type=int, default=3)
@@ -49,13 +46,11 @@ def main():
     config_path = args.config
     model_path = args.checkpoint
     # Check if the checkpoint file exists
     if not os.path.exists(model_path):
         logger.error(f"Checkpoint file not found at: {model_path}")
         logger.error(
-            "Please ensure 'checkpoint_38M.pth' is in the root directory"
-            " (or that you've cloned the repo with Git LFS)."
         )
         logger.error("You can also specify a different path using --checkpoint.")
         return  # Exit if no model
@@ -75,9 +70,7 @@ def main():
     # 2) Load the pretrained model (CUDA-only). This demo requires a CUDA GPU.
     if not torch.cuda.is_available():
-        raise RuntimeError(
-            "CUDA is required to run this demo. No CUDA device detected."
-        )
     device = torch.device("cuda:0")
     model = load_model(config_path=config_path, model_path=model_path, device=device)
@@ -90,9 +83,7 @@ def main():
     )
     # 4) Run inference (bfloat16 on CUDA) and plot results
-    run_inference_and_plot(
-        model=model, container=container, output_dir=output_dir, use_bfloat16=True
-    )
     logger.info("=== Demo completed successfully! ===")

 import argparse
 import logging
+import os
 import torch
 from examples.utils import (
     load_model,
     run_inference_and_plot,
 )
 # Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
     )
     parser.add_argument(
         "--checkpoint",
+        default="models/checkpoint_38M.pth",
         help="Path to model checkpoint file (default: models/checkpoint_38M.pth)",
     )
     parser.add_argument("--batch_size", type=int, default=3)
     config_path = args.config
     model_path = args.checkpoint
     # Check if the checkpoint file exists
     if not os.path.exists(model_path):
         logger.error(f"Checkpoint file not found at: {model_path}")
         logger.error(
+            "Please ensure 'checkpoint_38M.pth' is in the root directory (or that you've cloned the repo with Git LFS)."
         )
         logger.error("You can also specify a different path using --checkpoint.")
         return  # Exit if no model
     # 2) Load the pretrained model (CUDA-only). This demo requires a CUDA GPU.
     if not torch.cuda.is_available():
+        raise RuntimeError("CUDA is required to run this demo. No CUDA device detected.")
     device = torch.device("cuda:0")
     model = load_model(config_path=config_path, model_path=model_path, device=device)
     )
     # 4) Run inference (bfloat16 on CUDA) and plot results
+    run_inference_and_plot(model=model, container=container, output_dir=output_dir, use_bfloat16=True)
     logger.info("=== Demo completed successfully! ===")

examples/utils.py CHANGED Viewed

@@ -1,12 +1,9 @@
 import logging
 import os
-import urllib.request
-from typing import List
 import numpy as np
 import torch
 import yaml
 from src.data.containers import BatchTimeSeriesContainer
 from src.models.model import TimeSeriesModel
 from src.plotting.plot_timeseries import plot_from_container
@@ -14,11 +11,9 @@ from src.plotting.plot_timeseries import plot_from_container
 logger = logging.getLogger(__name__)
-def load_model(
-    config_path: str, model_path: str, device: torch.device
-) -> TimeSeriesModel:
     """Load the TimeSeriesModel from config and checkpoint."""
-    with open(config_path, "r") as f:
         config = yaml.safe_load(f)
     model = TimeSeriesModel(**config["TimeSeriesModel"]).to(device)
@@ -29,32 +24,10 @@ def load_model(
     return model
-def download_checkpoint_if_needed(url: str, target_dir: str = "models") -> str:
-    """Download checkpoint from URL into target_dir if not present and return its path.
-    Ensures direct download for Dropbox links by forcing dl=1.
-    """
-    os.makedirs(target_dir, exist_ok=True)
-    target_path = os.path.join(target_dir, "checkpoint.pth")
-    # Normalize Dropbox URL to force direct download
-    if "dropbox.com" in url and "dl=0" in url:
-        url = url.replace("dl=0", "dl=1")
-    if not os.path.exists(target_path):
-        logger.info(f"Downloading checkpoint from {url} to {target_path}...")
-        urllib.request.urlretrieve(url, target_path)
-        logger.info("Checkpoint downloaded successfully.")
-    else:
-        logger.info(f"Using existing checkpoint at {target_path}")
-    return target_path
 def plot_with_library(
     container: BatchTimeSeriesContainer,
     predictions_np: np.ndarray,  # [B, P, N, Q]
-    model_quantiles: List[float] | None,
     output_dir: str = "outputs",
     show_plots: bool = True,
     save_plots: bool = True,
@@ -62,11 +35,7 @@ def plot_with_library(
     os.makedirs(output_dir, exist_ok=True)
     batch_size = container.batch_size
     for i in range(batch_size):
-        output_file = (
-            os.path.join(output_dir, f"sine_wave_prediction_sample_{i + 1}.png")
-            if save_plots
-            else None
-        )
         plot_from_container(
             batch=container,
             sample_idx=i,
@@ -89,22 +58,16 @@ def run_inference_and_plot(
     autocast_enabled = use_bfloat16 and device_type == "cuda"
     with (
         torch.no_grad(),
-        torch.autocast(
-            device_type=device_type, dtype=torch.bfloat16, enabled=autocast_enabled
-        ),
     ):
         model_output = model(container)
     preds_full = model_output["result"].to(torch.float32)
     if hasattr(model, "scaler") and "scale_statistics" in model_output:
-        preds_full = model.scaler.inverse_scale(
-            preds_full, model_output["scale_statistics"]
-        )
     preds_np = preds_full.detach().cpu().numpy()
-    model_quantiles = (
-        model.quantiles if getattr(model, "loss_type", None) == "quantile" else None
-    )
     plot_with_library(
         container=container,
         predictions_np=preds_np,

 import logging
 import os
 import numpy as np
 import torch
 import yaml
 from src.data.containers import BatchTimeSeriesContainer
 from src.models.model import TimeSeriesModel
 from src.plotting.plot_timeseries import plot_from_container
 logger = logging.getLogger(__name__)
+def load_model(config_path: str, model_path: str, device: torch.device) -> TimeSeriesModel:
     """Load the TimeSeriesModel from config and checkpoint."""
+    with open(config_path) as f:
         config = yaml.safe_load(f)
     model = TimeSeriesModel(**config["TimeSeriesModel"]).to(device)
     return model
 def plot_with_library(
     container: BatchTimeSeriesContainer,
     predictions_np: np.ndarray,  # [B, P, N, Q]
+    model_quantiles: list[float] | None,
     output_dir: str = "outputs",
     show_plots: bool = True,
     save_plots: bool = True,
     os.makedirs(output_dir, exist_ok=True)
     batch_size = container.batch_size
     for i in range(batch_size):
+        output_file = os.path.join(output_dir, f"sine_wave_prediction_sample_{i + 1}.png") if save_plots else None
         plot_from_container(
             batch=container,
             sample_idx=i,
     autocast_enabled = use_bfloat16 and device_type == "cuda"
     with (
         torch.no_grad(),
+        torch.autocast(device_type=device_type, dtype=torch.bfloat16, enabled=autocast_enabled),
     ):
         model_output = model(container)
     preds_full = model_output["result"].to(torch.float32)
     if hasattr(model, "scaler") and "scale_statistics" in model_output:
+        preds_full = model.scaler.inverse_scale(preds_full, model_output["scale_statistics"])
     preds_np = preds_full.detach().cpu().numpy()
+    model_quantiles = model.quantiles if getattr(model, "loss_type", None) == "quantile" else None
     plot_with_library(
         container=container,
         predictions_np=preds_np,

pyproject.toml CHANGED Viewed

@@ -60,3 +60,33 @@ requires = ["setuptools>=68.2.2", "wheel>=0.41.2"]
 build-backend = "setuptools.build_meta"
 package-dir = {"" = "src"}

 build-backend = "setuptools.build_meta"
 package-dir = {"" = "src"}
+[tool.ruff]
+line-length = 120
+# Set the minimum Python version to target.
+target-version = "py312"
+# Define the source directories. This matches your project structure.
+src = ["src"]
+[tool.ruff.lint]
+# Select the rules to enable. This is a great starting set.
+# E = pycodestyle errors
+# F = Pyflakes (e.g., unused imports, undefined names)
+# I = isort (import sorting)
+# UP = pyupgrade (modernize Python syntax)
+# B = flake8-bugbear (common bugs and bad practices)
+# C4 = flake8-comprehensions (more efficient comprehensions)
+select = ["E", "F", "I", "UP", "B", "C4"]
+# You can ignore specific rules here. For example, if you add "D" (pydocstyle)
+# to `select` above but don't want to enforce docstrings on public modules,
+# classes, methods, and functions, uncomment the line below:
+# ignore = ["D100", "D101", "D102", "D103"]
+[tool.ruff.format]
+# Use "black-compatible" formatting.
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"

src/data/augmentations.py CHANGED Viewed

@@ -2,15 +2,13 @@ import logging
 import math
 from collections import Counter
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple
 import numpy as np
 import torch
 import torch.nn as nn
 from joblib import Parallel, delayed
 from torch.quasirandom import SobolEngine
-import torch.nn.functional as F
 from src.gift_eval.data import Dataset
@@ -38,9 +36,7 @@ def analyze_datasets_for_augmentation(gift_eval_path_str: str) -> dict:
     Analyzes all datasets to derive statistics needed for NaN augmentation.
     This version collects the full distribution of NaN ratios.
     """
-    logger.info(
-        "--- Starting Dataset Analysis for Augmentation (Full Distribution) ---"
-    )
     path = Path(gift_eval_path_str)
     if not path.exists():
         raise FileNotFoundError(
@@ -79,18 +75,12 @@ def analyze_datasets_for_augmentation(gift_eval_path_str: str) -> dict:
                     nan_lengths = find_consecutive_nan_lengths(target)
                     all_consecutive_nan_lengths.update(nan_lengths)
         except Exception as e:
-            logger.warning(
-                f"Could not process {ds_name} for augmentation analysis: {e}"
-            )
     if total_series_count == 0:
-        raise ValueError(
-            "No series were found during augmentation analysis. Check dataset path."
-        )
-    p_series_has_nan = (
-        series_with_nans_count / total_series_count if total_series_count > 0 else 0
-    )
     logger.info("--- Augmentation Analysis Complete ---")
     # Print summary statistics
@@ -115,11 +105,11 @@ class NanAugmenter:
     def __init__(
         self,
         p_series_has_nan: float,
-        nan_ratio_distribution: List[float],
         nan_length_distribution: Counter,
         num_patterns: int = 100000,
         n_jobs: int = -1,
-        nan_patterns_path: Optional[str] = None,
     ):
         """
         Initializes the augmenter. NaN patterns are not generated at this stage.
@@ -138,7 +128,7 @@ class NanAugmenter:
         self.max_length = 2048
         self.nan_patterns_path = nan_patterns_path
         # Cache to store patterns: Dict[shape_tuple -> pattern_tensor]
-        self.pattern_cache: Dict[Tuple[int, ...], torch.BoolTensor] = {}
         if not nan_length_distribution or sum(nan_length_distribution.values()) == 0:
             self._has_block_distribution = False
@@ -146,10 +136,8 @@ class NanAugmenter:
         else:
             self._has_block_distribution = True
             total_blocks = sum(nan_length_distribution.values())
-            self.dist_lengths = list(int(i) for i in nan_length_distribution.keys())
-            self.dist_probs = [
-                count / total_blocks for count in nan_length_distribution.values()
-            ]
         if not self.nan_ratio_distribution:
             logger.warning("NaN ratio distribution is empty. Augmentation disabled.")
@@ -160,13 +148,11 @@ class NanAugmenter:
     def _load_existing_patterns(self):
         """Load existing NaN patterns from disk if they exist."""
         # Determine where to look for patterns
-        explicit_path: Optional[Path] = (
-            Path(self.nan_patterns_path).resolve()
-            if self.nan_patterns_path is not None
-            else None
         )
-        candidate_files: List[Path] = []
         if explicit_path is not None:
             # If the explicit path exists, use it directly
             if explicit_path.is_file():
@@ -174,20 +160,16 @@ class NanAugmenter:
             # Also search the directory of the explicit path for matching files
             explicit_dir = explicit_path.parent
             explicit_dir.mkdir(exist_ok=True, parents=True)
-            candidate_files.extend(
-                list(explicit_dir.glob(f"nan_patterns_{self.max_length}_*.pt"))
-            )
         else:
             # Default to the ./data directory
             data_dir = Path("data")
             data_dir.mkdir(exist_ok=True)
-            candidate_files.extend(
-                list(data_dir.glob(f"nan_patterns_{self.max_length}_*.pt"))
-            )
         # De-duplicate candidate files while preserving order
         seen: set[str] = set()
-        unique_candidates: List[Path] = []
         for f in candidate_files:
             key = str(f.resolve())
             if key not in seen:
@@ -207,9 +189,7 @@ class NanAugmenter:
                     cache_key = (self.max_length, num_channels)
                     self.pattern_cache[cache_key] = patterns
-                    logger.info(
-                        f"Loaded {patterns.shape[0]} patterns for shape {cache_key} from {pattern_file}"
-                    )
             except (ValueError, RuntimeError, FileNotFoundError) as e:
                 logger.warning(f"Failed to load patterns from {pattern_file}: {e}")
@@ -225,7 +205,7 @@ class NanAugmenter:
         return base_dir / f"nan_patterns_{self.max_length}_{num_channels}.pt"
-    def _generate_nan_mask(self, series_shape: Tuple[int, ...]) -> np.ndarray:
         """Generates a single boolean NaN mask for a given series shape."""
         series_size = int(np.prod(series_shape))
         sampled_ratio = np.random.choice(self.nan_ratio_distribution)
@@ -247,9 +227,7 @@ class NanAugmenter:
             if block_length <= 0:
                 break
-            nan_counts_in_window = np.convolve(
-                mask_flat, np.ones(block_length), mode="valid"
-            )
             valid_starts = np.where(nan_counts_in_window == 0)[0]
             if valid_starts.size == 0:
@@ -261,20 +239,15 @@ class NanAugmenter:
         return mask_flat.reshape(series_shape)
-    def _pregenerate_patterns(self, series_shape: Tuple[int, ...]) -> torch.BoolTensor:
         """Uses joblib to parallelize the generation of NaN masks for a given shape."""
         if not self._has_block_distribution or not self.nan_ratio_distribution:
             return torch.empty(0, *series_shape, dtype=torch.bool)
-        logger.info(
-            f"Generating {self.num_patterns} NaN patterns for shape {series_shape}..."
-        )
         with Parallel(n_jobs=self.n_jobs, backend="loky") as parallel:
-            masks_list = parallel(
-                delayed(self._generate_nan_mask)(series_shape)
-                for _ in range(self.num_patterns)
-            )
         logger.info(f"Pattern generation complete for shape {series_shape}.")
         return torch.from_numpy(np.stack(masks_list)).bool()
@@ -302,29 +275,19 @@ class NanAugmenter:
                 try:
                     patterns = torch.load(target_file, map_location="cpu")
                     self.pattern_cache[(self.max_length, num_channels)] = patterns
-                    logger.info(
-                        f"Loaded NaN patterns from {target_file} for shape {(self.max_length, num_channels)}"
-                    )
                 except (RuntimeError, FileNotFoundError):
                     # Fall back to generating if loading fails
-                    patterns = self._pregenerate_patterns(
-                        (self.max_length, num_channels)
-                    )
                     torch.save(patterns, target_file)
                     self.pattern_cache[(self.max_length, num_channels)] = patterns
-                    logger.info(
-                        f"Generated and saved {patterns.shape[0]} NaN patterns to {target_file}"
-                    )
             else:
                 patterns = self._pregenerate_patterns((self.max_length, num_channels))
                 torch.save(patterns, target_file)
                 self.pattern_cache[(self.max_length, num_channels)] = patterns
-                logger.info(
-                    f"Generated and saved {patterns.shape[0]} NaN patterns to {target_file}"
-                )
-        patterns = self.pattern_cache[(self.max_length, num_channels)][
-            :, :history_length, :
-        ]
         # Early exit if patterns are empty (e.g., generation failed or was disabled)
         if patterns.numel() == 0:
@@ -342,15 +305,13 @@ class NanAugmenter:
             return time_series_batch
         # 3. Randomly sample patterns for each series being augmented
-        pattern_indices = torch.randint(
-            0, patterns.shape[0], (num_to_augment,), device=device
-        )
         # 4. Select patterns and apply them in a single vectorized operation
         selected_patterns = patterns[pattern_indices].to(device)
-        time_series_batch[indices_to_augment] = time_series_batch[
-            indices_to_augment
-        ].masked_fill(selected_patterns, float("nan"))
         return time_series_batch
@@ -419,8 +380,8 @@ class QuantizationAugmenter:
     def __init__(
         self,
         p_quantize: float,
-        level_range: Tuple[int, int],
-        seed: Optional[int] = None,
     ):
         """
         Initializes the augmenter.
@@ -433,9 +394,7 @@ class QuantizationAugmenter:
         """
         assert 0.0 <= p_quantize <= 1.0, "Probability must be between 0 and 1."
         assert level_range[0] >= 2, "Minimum number of levels must be at least 2."
-        assert level_range[0] <= level_range[1], (
-            "Min levels cannot be greater than max."
-        )
         self.p_quantize = p_quantize
         self.level_range = level_range
@@ -445,9 +404,7 @@ class QuantizationAugmenter:
         max_intermediate_levels = self.level_range[1] - 2
         if max_intermediate_levels > 0:
             # SobolEngine must be created on CPU
-            self.sobol_engine = SobolEngine(
-                dimension=max_intermediate_levels, scramble=True, seed=seed
-            )
         else:
             self.sobol_engine = None
@@ -480,9 +437,7 @@ class QuantizationAugmenter:
         # 2. Determine a variable n_levels for EACH series
         min_l, max_l = self.level_range
-        n_levels_per_series = torch.randint(
-            min_l, max_l + 1, size=(n_augment,), device=device
-        )
         max_levels_in_batch = n_levels_per_series.max().item()
         # 3. Find min/max for each series
@@ -547,7 +502,7 @@ class MixUpAugmenter:
         p_combine: float = 0.4,
         p_time_dependent: float = 0.5,
         randomize_k_per_series: bool = True,
-        dirichlet_alpha_range: Tuple[float, float] = (0.1, 5.0),
     ):
         """
         Initializes the augmenter.
@@ -568,13 +523,8 @@ class MixUpAugmenter:
         """
         assert max_n_series_to_combine >= 2, "Must combine at least 2 series."
         assert 0.0 <= p_combine <= 1.0, "p_combine must be between 0 and 1."
-        assert 0.0 <= p_time_dependent <= 1.0, (
-            "p_time_dependent must be between 0 and 1."
-        )
-        assert (
-            dirichlet_alpha_range[0] > 0
-            and dirichlet_alpha_range[0] <= dirichlet_alpha_range[1]
-        )
         self.max_k = max_n_series_to_combine
         self.p_combine = p_combine
         self.p_time_dependent = p_time_dependent
@@ -628,9 +578,9 @@ class MixUpAugmenter:
         # 3. Interpolate between the endpoint weights over time
         # Reshape for broadcasting: w vectors become [k, 1], ramp becomes [1, length]
-        time_varying_weights = w_start.unsqueeze(1) * (
-            1 - alpha_ramp.unsqueeze(0)
-        ) + w_end.unsqueeze(1) * alpha_ramp.unsqueeze(0)
         # The result `time_varying_weights` has shape [k, length]
         # 4. Apply the time-varying weights
@@ -641,26 +591,20 @@ class MixUpAugmenter:
             return mixed_series, time_varying_weights
         return mixed_series
-    def transform(
-        self, time_series_batch: torch.Tensor, return_debug_info: bool = False
-    ):
         """
         Applies the mixup augmentation, randomly choosing between static and
         time-dependent mixing methods.
         """
         with torch.no_grad():
             if self.p_combine == 0:
-                return (
-                    (time_series_batch, {}) if return_debug_info else time_series_batch
-                )
             batch_size, _, _ = time_series_batch.shape
             device = time_series_batch.device
             if batch_size <= self.max_k:
-                return (
-                    (time_series_batch, {}) if return_debug_info else time_series_batch
-                )
             # 1. Decide which series to replace
             augment_mask = torch.rand(batch_size, device=device) < self.p_combine
@@ -668,9 +612,7 @@ class MixUpAugmenter:
             n_augment = indices_to_replace.numel()
             if n_augment == 0:
-                return (
-                    (time_series_batch, {}) if return_debug_info else time_series_batch
-                )
             # 2. Determine k for each series to augment
             if self.randomize_k:
@@ -699,14 +641,10 @@ class MixUpAugmenter:
                 # Randomly choose between static and time-dependent mixup
                 if torch.rand(1).item() < self.p_time_dependent:
-                    mixed_series, weights = self._simplex_path_mix(
-                        source_series, alpha=alpha, return_weights=True
-                    )
                     mix_type = "simplex"
                 else:
-                    mixed_series, weights = self._static_mix(
-                        source_series, alpha=alpha, return_weights=True
-                    )
                 new_series_list.append(mixed_series)
@@ -851,8 +789,8 @@ class DifferentialAugmenter:
     def __init__(
         self,
         p_transform: float,
-        gaussian_kernel_size_range: Tuple[int, int] = (5, 51),
-        gaussian_sigma_range: Tuple[float, float] = (2.0, 20.0),
     ):
         """
         Initializes the augmenter.
@@ -871,22 +809,15 @@ class DifferentialAugmenter:
         self.sigma_range = gaussian_sigma_range
         # Validate ranges
-        if not (
-            self.kernel_size_range[0] <= self.kernel_size_range[1]
-            and self.kernel_size_range[0] >= 3
-        ):
-            raise ValueError(
-                "Invalid kernel size range. Ensure min <= max and min >= 3."
-            )
         if not (self.sigma_range[0] <= self.sigma_range[1] and self.sigma_range[0] > 0):
             raise ValueError("Invalid sigma range. Ensure min <= max and min > 0.")
         # Cache for fixed-kernel convolution layers (Sobel, Laplace, etc.)
-        self.conv_cache: Dict[Tuple[int, torch.device], Dict[str, nn.Module]] = {}
-    def _create_fixed_kernel_layers(
-        self, num_channels: int, device: torch.device
-    ) -> dict:
         """
         Creates and configures nn.Conv1d layers for fixed-kernel derivative operations.
         These layers are cached to improve performance.
@@ -933,14 +864,10 @@ class DifferentialAugmenter:
         )
         sobel_kernel = (
-            torch.tensor([-1, 0, 1], device=device, dtype=torch.float32)
-            .view(1, 1, -1)
-            .repeat(num_channels, 1, 1)
         )
         laplace_kernel = (
-            torch.tensor([1, -2, 1], device=device, dtype=torch.float32)
-            .view(1, 1, -1)
-            .repeat(num_channels, 1, 1)
         )
         d3_kernel = (
             torch.tensor([-1, 2, 0, -2, 1], device=device, dtype=torch.float32)
@@ -995,9 +922,7 @@ class DifferentialAugmenter:
         gauss_conv.weight.requires_grad = False
         return gauss_conv
-    def _rescale_signal(
-        self, processed_signal: torch.Tensor, original_signal: torch.Tensor
-    ) -> torch.Tensor:
         """Rescales the processed signal to match the min/max range of the original."""
         original_min = torch.amin(original_signal, dim=2, keepdim=True)
         original_max = torch.amax(original_signal, dim=2, keepdim=True)
@@ -1037,15 +962,11 @@ class DifferentialAugmenter:
             sigma = (min_s + (max_s - min_s) * torch.rand(1)).item()
             # --- Get/Create Convolution Layers ---
-            gauss_conv = self._create_gaussian_layer(
-                kernel_size, sigma, num_channels, device
-            )
             cache_key = (num_channels, device)
             if cache_key not in self.conv_cache:
-                self.conv_cache[cache_key] = self._create_fixed_kernel_layers(
-                    num_channels, device
-                )
             fixed_layers = self.conv_cache[cache_key]
             # --- Apply Augmentations ---
@@ -1070,33 +991,17 @@ class DifferentialAugmenter:
             flipped_subset = torch.flip(subset_permuted, dims=[2])
             right_integral = torch.flip(torch.cumsum(flipped_subset, dim=2), dims=[2])
             left_integral = torch.cumsum(subset_permuted, dim=2)
-            integral_result = torch.where(
-                use_right_integral, right_integral, left_integral
-            )
-            integral_result_normalized = self._rescale_signal(
-                integral_result, subset_permuted
-            )
             # --- Assemble the results based on op_choices ---
             op_choices_view = op_choices.view(-1, 1, 1)
-            augmented_subset = torch.where(
-                op_choices_view == 0, gauss_result, subset_permuted
-            )
-            augmented_subset = torch.where(
-                op_choices_view == 1, sobel_result, augmented_subset
-            )
-            augmented_subset = torch.where(
-                op_choices_view == 2, laplace_result, augmented_subset
-            )
-            augmented_subset = torch.where(
-                op_choices_view == 3, integral_result_normalized, augmented_subset
-            )
-            augmented_subset = torch.where(
-                op_choices_view == 4, d3_result, augmented_subset
-            )
-            augmented_subset = torch.where(
-                op_choices_view == 5, d4_result, augmented_subset
-            )
             augmented_subset_final = augmented_subset.permute(0, 2, 1)
             augmented_batch = time_series_batch.clone()
@@ -1118,11 +1023,11 @@ class RandomConvAugmenter:
     def __init__(
         self,
         p_transform: float = 0.5,
-        kernel_size_range: Tuple[int, int] = (3, 31),
-        dilation_range: Tuple[int, int] = (1, 8),
-        layer_range: Tuple[int, int] = (1, 3),
-        sigma_range: Tuple[float, float] = (0.5, 5.0),
-        bias_range: Tuple[float, float] = (-0.5, 0.5),
     ):
         """
         Initializes the augmenter.
@@ -1138,9 +1043,7 @@ class RandomConvAugmenter:
                                                Gaussian kernels.
             bias_range (Tuple[float, float]): [min, max] range for the bias term.
         """
-        assert kernel_size_range[0] % 2 == 1 and kernel_size_range[1] % 2 == 1, (
-            "Kernel sizes must be odd."
-        )
         self.p_transform = p_transform
         self.kernel_size_range = kernel_size_range
@@ -1150,9 +1053,7 @@ class RandomConvAugmenter:
         self.bias_range = bias_range
         self.padding_modes = ["reflect", "replicate", "circular"]
-    def _rescale_signal(
-        self, processed_signal: torch.Tensor, original_signal: torch.Tensor
-    ) -> torch.Tensor:
         """Rescales the processed signal to match the min/max range of the original."""
         original_min = torch.amin(original_signal, dim=-1, keepdim=True)
         original_max = torch.amax(original_signal, dim=-1, keepdim=True)
@@ -1187,9 +1088,7 @@ class RandomConvAugmenter:
         num_channels = series.shape[1]
         device = series.device
-        num_layers = torch.randint(
-            self.layer_range[0], self.layer_range[1] + 1, (1,)
-        ).item()
         processed_series = series
         for i in range(num_layers):
@@ -1241,9 +1140,7 @@ class RandomConvAugmenter:
             else:  # Noisy Sobel kernel
                 # Ensure kernel is large enough for a Sobel filter
                 actual_kernel_size = 3 if kernel_size < 3 else kernel_size
-                sobel_base = torch.tensor(
-                    [-1, 0, 1], dtype=torch.float32, device=device
-                )
                 noise = torch.randn(3, device=device) * 0.1
                 noisy_sobel = sobel_base + noise
                 # Pad if the random kernel size is larger than 3
@@ -1302,9 +1199,7 @@ class RandomConvAugmenter:
                 original_series = subset_permuted[i : i + 1]
                 augmented_series = self._apply_random_conv_stack(original_series)
-                rescaled_series = self._rescale_signal(
-                    augmented_series.squeeze(0), original_series.squeeze(0)
-                )
                 augmented_subset_list.append(rescaled_series.unsqueeze(0))
             if augmented_subset_list:

 import math
 from collections import Counter
 from pathlib import Path
 import numpy as np
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from joblib import Parallel, delayed
 from torch.quasirandom import SobolEngine
 from src.gift_eval.data import Dataset
     Analyzes all datasets to derive statistics needed for NaN augmentation.
     This version collects the full distribution of NaN ratios.
     """
+    logger.info("--- Starting Dataset Analysis for Augmentation (Full Distribution) ---")
     path = Path(gift_eval_path_str)
     if not path.exists():
         raise FileNotFoundError(
                     nan_lengths = find_consecutive_nan_lengths(target)
                     all_consecutive_nan_lengths.update(nan_lengths)
         except Exception as e:
+            logger.warning(f"Could not process {ds_name} for augmentation analysis: {e}")
     if total_series_count == 0:
+        raise ValueError("No series were found during augmentation analysis. Check dataset path.")
+    p_series_has_nan = series_with_nans_count / total_series_count if total_series_count > 0 else 0
     logger.info("--- Augmentation Analysis Complete ---")
     # Print summary statistics
     def __init__(
         self,
         p_series_has_nan: float,
+        nan_ratio_distribution: list[float],
         nan_length_distribution: Counter,
         num_patterns: int = 100000,
         n_jobs: int = -1,
+        nan_patterns_path: str | None = None,
     ):
         """
         Initializes the augmenter. NaN patterns are not generated at this stage.
         self.max_length = 2048
         self.nan_patterns_path = nan_patterns_path
         # Cache to store patterns: Dict[shape_tuple -> pattern_tensor]
+        self.pattern_cache: dict[tuple[int, ...], torch.BoolTensor] = {}
         if not nan_length_distribution or sum(nan_length_distribution.values()) == 0:
             self._has_block_distribution = False
         else:
             self._has_block_distribution = True
             total_blocks = sum(nan_length_distribution.values())
+            self.dist_lengths = [int(i) for i in nan_length_distribution.keys()]
+            self.dist_probs = [count / total_blocks for count in nan_length_distribution.values()]
         if not self.nan_ratio_distribution:
             logger.warning("NaN ratio distribution is empty. Augmentation disabled.")
     def _load_existing_patterns(self):
         """Load existing NaN patterns from disk if they exist."""
         # Determine where to look for patterns
+        explicit_path: Path | None = (
+            Path(self.nan_patterns_path).resolve() if self.nan_patterns_path is not None else None
         )
+        candidate_files: list[Path] = []
         if explicit_path is not None:
             # If the explicit path exists, use it directly
             if explicit_path.is_file():
             # Also search the directory of the explicit path for matching files
             explicit_dir = explicit_path.parent
             explicit_dir.mkdir(exist_ok=True, parents=True)
+            candidate_files.extend(list(explicit_dir.glob(f"nan_patterns_{self.max_length}_*.pt")))
         else:
             # Default to the ./data directory
             data_dir = Path("data")
             data_dir.mkdir(exist_ok=True)
+            candidate_files.extend(list(data_dir.glob(f"nan_patterns_{self.max_length}_*.pt")))
         # De-duplicate candidate files while preserving order
         seen: set[str] = set()
+        unique_candidates: list[Path] = []
         for f in candidate_files:
             key = str(f.resolve())
             if key not in seen:
                     cache_key = (self.max_length, num_channels)
                     self.pattern_cache[cache_key] = patterns
+                    logger.info(f"Loaded {patterns.shape[0]} patterns for shape {cache_key} from {pattern_file}")
             except (ValueError, RuntimeError, FileNotFoundError) as e:
                 logger.warning(f"Failed to load patterns from {pattern_file}: {e}")
         return base_dir / f"nan_patterns_{self.max_length}_{num_channels}.pt"
+    def _generate_nan_mask(self, series_shape: tuple[int, ...]) -> np.ndarray:
         """Generates a single boolean NaN mask for a given series shape."""
         series_size = int(np.prod(series_shape))
         sampled_ratio = np.random.choice(self.nan_ratio_distribution)
             if block_length <= 0:
                 break
+            nan_counts_in_window = np.convolve(mask_flat, np.ones(block_length), mode="valid")
             valid_starts = np.where(nan_counts_in_window == 0)[0]
             if valid_starts.size == 0:
         return mask_flat.reshape(series_shape)
+    def _pregenerate_patterns(self, series_shape: tuple[int, ...]) -> torch.BoolTensor:
         """Uses joblib to parallelize the generation of NaN masks for a given shape."""
         if not self._has_block_distribution or not self.nan_ratio_distribution:
             return torch.empty(0, *series_shape, dtype=torch.bool)
+        logger.info(f"Generating {self.num_patterns} NaN patterns for shape {series_shape}...")
         with Parallel(n_jobs=self.n_jobs, backend="loky") as parallel:
+            masks_list = parallel(delayed(self._generate_nan_mask)(series_shape) for _ in range(self.num_patterns))
         logger.info(f"Pattern generation complete for shape {series_shape}.")
         return torch.from_numpy(np.stack(masks_list)).bool()
                 try:
                     patterns = torch.load(target_file, map_location="cpu")
                     self.pattern_cache[(self.max_length, num_channels)] = patterns
+                    logger.info(f"Loaded NaN patterns from {target_file} for shape {(self.max_length, num_channels)}")
                 except (RuntimeError, FileNotFoundError):
                     # Fall back to generating if loading fails
+                    patterns = self._pregenerate_patterns((self.max_length, num_channels))
                     torch.save(patterns, target_file)
                     self.pattern_cache[(self.max_length, num_channels)] = patterns
+                    logger.info(f"Generated and saved {patterns.shape[0]} NaN patterns to {target_file}")
             else:
                 patterns = self._pregenerate_patterns((self.max_length, num_channels))
                 torch.save(patterns, target_file)
                 self.pattern_cache[(self.max_length, num_channels)] = patterns
+                logger.info(f"Generated and saved {patterns.shape[0]} NaN patterns to {target_file}")
+        patterns = self.pattern_cache[(self.max_length, num_channels)][:, :history_length, :]
         # Early exit if patterns are empty (e.g., generation failed or was disabled)
         if patterns.numel() == 0:
             return time_series_batch
         # 3. Randomly sample patterns for each series being augmented
+        pattern_indices = torch.randint(0, patterns.shape[0], (num_to_augment,), device=device)
         # 4. Select patterns and apply them in a single vectorized operation
         selected_patterns = patterns[pattern_indices].to(device)
+        time_series_batch[indices_to_augment] = time_series_batch[indices_to_augment].masked_fill(
+            selected_patterns, float("nan")
+        )
         return time_series_batch
     def __init__(
         self,
         p_quantize: float,
+        level_range: tuple[int, int],
+        seed: int | None = None,
     ):
         """
         Initializes the augmenter.
         """
         assert 0.0 <= p_quantize <= 1.0, "Probability must be between 0 and 1."
         assert level_range[0] >= 2, "Minimum number of levels must be at least 2."
+        assert level_range[0] <= level_range[1], "Min levels cannot be greater than max."
         self.p_quantize = p_quantize
         self.level_range = level_range
         max_intermediate_levels = self.level_range[1] - 2
         if max_intermediate_levels > 0:
             # SobolEngine must be created on CPU
+            self.sobol_engine = SobolEngine(dimension=max_intermediate_levels, scramble=True, seed=seed)
         else:
             self.sobol_engine = None
         # 2. Determine a variable n_levels for EACH series
         min_l, max_l = self.level_range
+        n_levels_per_series = torch.randint(min_l, max_l + 1, size=(n_augment,), device=device)
         max_levels_in_batch = n_levels_per_series.max().item()
         # 3. Find min/max for each series
         p_combine: float = 0.4,
         p_time_dependent: float = 0.5,
         randomize_k_per_series: bool = True,
+        dirichlet_alpha_range: tuple[float, float] = (0.1, 5.0),
     ):
         """
         Initializes the augmenter.
         """
         assert max_n_series_to_combine >= 2, "Must combine at least 2 series."
         assert 0.0 <= p_combine <= 1.0, "p_combine must be between 0 and 1."
+        assert 0.0 <= p_time_dependent <= 1.0, "p_time_dependent must be between 0 and 1."
+        assert dirichlet_alpha_range[0] > 0 and dirichlet_alpha_range[0] <= dirichlet_alpha_range[1]
         self.max_k = max_n_series_to_combine
         self.p_combine = p_combine
         self.p_time_dependent = p_time_dependent
         # 3. Interpolate between the endpoint weights over time
         # Reshape for broadcasting: w vectors become [k, 1], ramp becomes [1, length]
+        time_varying_weights = w_start.unsqueeze(1) * (1 - alpha_ramp.unsqueeze(0)) + w_end.unsqueeze(
+            1
+        ) * alpha_ramp.unsqueeze(0)
         # The result `time_varying_weights` has shape [k, length]
         # 4. Apply the time-varying weights
             return mixed_series, time_varying_weights
         return mixed_series
+    def transform(self, time_series_batch: torch.Tensor, return_debug_info: bool = False):
         """
         Applies the mixup augmentation, randomly choosing between static and
         time-dependent mixing methods.
         """
         with torch.no_grad():
             if self.p_combine == 0:
+                return (time_series_batch, {}) if return_debug_info else time_series_batch
             batch_size, _, _ = time_series_batch.shape
             device = time_series_batch.device
             if batch_size <= self.max_k:
+                return (time_series_batch, {}) if return_debug_info else time_series_batch
             # 1. Decide which series to replace
             augment_mask = torch.rand(batch_size, device=device) < self.p_combine
             n_augment = indices_to_replace.numel()
             if n_augment == 0:
+                return (time_series_batch, {}) if return_debug_info else time_series_batch
             # 2. Determine k for each series to augment
             if self.randomize_k:
                 # Randomly choose between static and time-dependent mixup
                 if torch.rand(1).item() < self.p_time_dependent:
+                    mixed_series, weights = self._simplex_path_mix(source_series, alpha=alpha, return_weights=True)
                     mix_type = "simplex"
                 else:
+                    mixed_series, weights = self._static_mix(source_series, alpha=alpha, return_weights=True)
                 new_series_list.append(mixed_series)
     def __init__(
         self,
         p_transform: float,
+        gaussian_kernel_size_range: tuple[int, int] = (5, 51),
+        gaussian_sigma_range: tuple[float, float] = (2.0, 20.0),
     ):
         """
         Initializes the augmenter.
         self.sigma_range = gaussian_sigma_range
         # Validate ranges
+        if not (self.kernel_size_range[0] <= self.kernel_size_range[1] and self.kernel_size_range[0] >= 3):
+            raise ValueError("Invalid kernel size range. Ensure min <= max and min >= 3.")
         if not (self.sigma_range[0] <= self.sigma_range[1] and self.sigma_range[0] > 0):
             raise ValueError("Invalid sigma range. Ensure min <= max and min > 0.")
         # Cache for fixed-kernel convolution layers (Sobel, Laplace, etc.)
+        self.conv_cache: dict[tuple[int, torch.device], dict[str, nn.Module]] = {}
+    def _create_fixed_kernel_layers(self, num_channels: int, device: torch.device) -> dict:
         """
         Creates and configures nn.Conv1d layers for fixed-kernel derivative operations.
         These layers are cached to improve performance.
         )
         sobel_kernel = (
+            torch.tensor([-1, 0, 1], device=device, dtype=torch.float32).view(1, 1, -1).repeat(num_channels, 1, 1)
         )
         laplace_kernel = (
+            torch.tensor([1, -2, 1], device=device, dtype=torch.float32).view(1, 1, -1).repeat(num_channels, 1, 1)
         )
         d3_kernel = (
             torch.tensor([-1, 2, 0, -2, 1], device=device, dtype=torch.float32)
         gauss_conv.weight.requires_grad = False
         return gauss_conv
+    def _rescale_signal(self, processed_signal: torch.Tensor, original_signal: torch.Tensor) -> torch.Tensor:
         """Rescales the processed signal to match the min/max range of the original."""
         original_min = torch.amin(original_signal, dim=2, keepdim=True)
         original_max = torch.amax(original_signal, dim=2, keepdim=True)
             sigma = (min_s + (max_s - min_s) * torch.rand(1)).item()
             # --- Get/Create Convolution Layers ---
+            gauss_conv = self._create_gaussian_layer(kernel_size, sigma, num_channels, device)
             cache_key = (num_channels, device)
             if cache_key not in self.conv_cache:
+                self.conv_cache[cache_key] = self._create_fixed_kernel_layers(num_channels, device)
             fixed_layers = self.conv_cache[cache_key]
             # --- Apply Augmentations ---
             flipped_subset = torch.flip(subset_permuted, dims=[2])
             right_integral = torch.flip(torch.cumsum(flipped_subset, dim=2), dims=[2])
             left_integral = torch.cumsum(subset_permuted, dim=2)
+            integral_result = torch.where(use_right_integral, right_integral, left_integral)
+            integral_result_normalized = self._rescale_signal(integral_result, subset_permuted)
             # --- Assemble the results based on op_choices ---
             op_choices_view = op_choices.view(-1, 1, 1)
+            augmented_subset = torch.where(op_choices_view == 0, gauss_result, subset_permuted)
+            augmented_subset = torch.where(op_choices_view == 1, sobel_result, augmented_subset)
+            augmented_subset = torch.where(op_choices_view == 2, laplace_result, augmented_subset)
+            augmented_subset = torch.where(op_choices_view == 3, integral_result_normalized, augmented_subset)
+            augmented_subset = torch.where(op_choices_view == 4, d3_result, augmented_subset)
+            augmented_subset = torch.where(op_choices_view == 5, d4_result, augmented_subset)
             augmented_subset_final = augmented_subset.permute(0, 2, 1)
             augmented_batch = time_series_batch.clone()
     def __init__(
         self,
         p_transform: float = 0.5,
+        kernel_size_range: tuple[int, int] = (3, 31),
+        dilation_range: tuple[int, int] = (1, 8),
+        layer_range: tuple[int, int] = (1, 3),
+        sigma_range: tuple[float, float] = (0.5, 5.0),
+        bias_range: tuple[float, float] = (-0.5, 0.5),
     ):
         """
         Initializes the augmenter.
                                                Gaussian kernels.
             bias_range (Tuple[float, float]): [min, max] range for the bias term.
         """
+        assert kernel_size_range[0] % 2 == 1 and kernel_size_range[1] % 2 == 1, "Kernel sizes must be odd."
         self.p_transform = p_transform
         self.kernel_size_range = kernel_size_range
         self.bias_range = bias_range
         self.padding_modes = ["reflect", "replicate", "circular"]
+    def _rescale_signal(self, processed_signal: torch.Tensor, original_signal: torch.Tensor) -> torch.Tensor:
         """Rescales the processed signal to match the min/max range of the original."""
         original_min = torch.amin(original_signal, dim=-1, keepdim=True)
         original_max = torch.amax(original_signal, dim=-1, keepdim=True)
         num_channels = series.shape[1]
         device = series.device
+        num_layers = torch.randint(self.layer_range[0], self.layer_range[1] + 1, (1,)).item()
         processed_series = series
         for i in range(num_layers):
             else:  # Noisy Sobel kernel
                 # Ensure kernel is large enough for a Sobel filter
                 actual_kernel_size = 3 if kernel_size < 3 else kernel_size
+                sobel_base = torch.tensor([-1, 0, 1], dtype=torch.float32, device=device)
                 noise = torch.randn(3, device=device) * 0.1
                 noisy_sobel = sobel_base + noise
                 # Pad if the random kernel size is larger than 3
                 original_series = subset_permuted[i : i + 1]
                 augmented_series = self._apply_random_conv_stack(original_series)
+                rescaled_series = self._rescale_signal(augmented_series.squeeze(0), original_series.squeeze(0))
                 augmented_subset_list.append(rescaled_series.unsqueeze(0))
             if augmented_subset_list:

src/data/batch_composer.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import json
 import logging
 import random
-from typing import Dict, Optional, Tuple
 import numpy as np
 import pandas as pd
@@ -30,15 +29,15 @@ class BatchComposer:
     def __init__(
         self,
         base_data_dir: str,
-        generator_proportions: Optional[Dict[str, float]] = None,
         mixed_batches: bool = True,
-        device: Optional[torch.device] = None,
-        augmentations: Optional[Dict[str, bool]] = None,
-        augmentation_probabilities: Optional[Dict[str, float]] = None,
-        nan_stats_path: Optional[str] = None,
-        nan_patterns_path: Optional[str] = None,
         global_seed: int = 42,
-        chosen_scaler_name: Optional[str] = None,
         rank: int = 0,
         world_size: int = 1,
     ):
@@ -70,9 +69,7 @@ class BatchComposer:
             "scaler_augmentation": 0.5,
         }
         # Optional preferred scaler name provided by training config
-        self.chosen_scaler_name = (
-            chosen_scaler_name.lower() if chosen_scaler_name is not None else None
-        )
         # Setup random state
         self.rng = np.random.default_rng(global_seed)
@@ -95,7 +92,7 @@ class BatchComposer:
             f"augmentation_probabilities={self.augmentation_probabilities}"
         )
-    def _setup_augmentations(self, augmentations: Optional[Dict[str, bool]]):
         """Setup only the augmentations that should remain online (NaN)."""
         default_augmentations = {
             "nan_augmentation": False,
@@ -109,7 +106,7 @@ class BatchComposer:
         self.nan_augmenter = None
         if self.augmentations.get("nan_augmentation", False):
             stats_path_to_use = self.nan_stats_path or DEFAULT_NAN_STATS_PATH
-            stats = json.load(open(stats_path_to_use, "r"))
             self.nan_augmenter = NanAugmenter(
                 p_series_has_nan=stats["p_series_has_nan"],
                 nan_ratio_distribution=stats["nan_ratio_distribution"],
@@ -124,20 +121,18 @@ class BatchComposer:
         """
         if not self.augmentations.get("scaler_augmentation", False):
             return False
-        probability = float(
-            self.augmentation_probabilities.get("scaler_augmentation", 0.0)
-        )
         probability = max(0.0, min(1.0, probability))
         return bool(self.rng.random() < probability)
-    def _choose_random_scaler(self) -> Optional[object]:
         """
         Choose a random scaler for augmentation, explicitly avoiding the one that
         is already selected in the training configuration (if any).
         Returns an instance of the selected scaler or None when no valid option exists.
         """
-        chosen: Optional[str] = None
         if self.chosen_scaler_name is not None:
             chosen = self.chosen_scaler_name.strip().lower()
@@ -188,11 +183,9 @@ class BatchComposer:
         total = sum(self.generator_proportions.values())
         if total <= 0:
             raise ValueError("Total generator proportions must be positive")
-        self.generator_proportions = {
-            k: v / total for k, v in self.generator_proportions.items()
-        }
-    def _initialize_datasets(self) -> Dict[str, CyclicalBatchDataset]:
         """Initialize CyclicalBatchDataset for each generator with proportion > 0."""
         datasets = {}
@@ -215,24 +208,20 @@ class BatchComposer:
                     world_size=self.world_size,
                 )
                 datasets[generator_name] = dataset
-                logger.info(
-                    f"Loaded dataset for {generator_name} (proportion = {proportion})"
-                )
             except Exception as e:
                 logger.warning(f"Failed to load dataset for {generator_name}: {e}")
                 continue
         if not datasets:
-            raise ValueError(
-                f"No valid datasets found in {self.base_data_dir} or all generators have proportion <= 0"
-            )
         return datasets
     def _convert_sample_to_tensors(
-        self, sample: dict, future_length: Optional[int] = None
-    ) -> Tuple[torch.Tensor, np.datetime64, Frequency]:
         """
         Convert a sample dict to tensors and metadata.
@@ -253,9 +242,7 @@ class BatchComposer:
             if isinstance(values_data[0], list):
                 # New format: [[channel_values]]
                 values = torch.tensor(values_data[0], dtype=torch.float32)
-                logger.debug(
-                    f"{generator_type}: Using new univariate format, shape: {values.shape}"
-                )
             else:
                 # Old format: [values]
                 values = torch.tensor(values_data, dtype=torch.float32)
@@ -269,9 +256,7 @@ class BatchComposer:
             # Stack channels: [1, seq_len, num_channels]
             values = torch.stack(channel_tensors, dim=-1).unsqueeze(0)
-            logger.debug(
-                f"{generator_type}: Using multivariate format, {num_channels} channels, shape: {values.shape}"
-            )
         # Handle frequency conversion
         freq_str = sample["frequency"]
@@ -304,9 +289,7 @@ class BatchComposer:
         return values, start, frequency
-    def _effective_proportions_for_length(
-        self, total_length_for_batch: int
-    ) -> Dict[str, float]:
         """
         Build a simple, length-aware proportion map for the current batch.
@@ -319,7 +302,7 @@ class BatchComposer:
         - Normalize the final map to sum to 1.
         """
-        def augmented_length_from_name(name: str) -> Optional[int]:
             if not name.startswith("augmented"):
                 return None
             suffix = name[len("augmented") :]
@@ -331,20 +314,16 @@ class BatchComposer:
                 return None
         # 1) Adjust proportions with the length-aware rule
-        adjusted: Dict[str, float] = {}
         for name, proportion in self.generator_proportions.items():
             aug_len = augmented_length_from_name(name)
             if aug_len is None:
                 adjusted[name] = proportion
             else:
-                adjusted[name] = (
-                    proportion if aug_len == total_length_for_batch else 0.0
-                )
         # 2) Keep only available, positive-weight datasets
-        adjusted = {
-            name: p for name, p in adjusted.items() if name in self.datasets and p > 0.0
-        }
         # 3) Fallback if empty
         if not adjusted:
@@ -362,20 +341,18 @@ class BatchComposer:
         total = sum(adjusted.values())
         return {name: p / total for name, p in adjusted.items()}
-    def _compute_sample_counts_for_batch(
-        self, proportions: Dict[str, float], batch_size: int
-    ) -> Dict[str, int]:
         """
         Convert a proportion map into integer sample counts that sum to batch_size.
         Strategy: allocate floor(batch_size * p) to each generator in order, and let the
         last generator absorb any remainder to ensure the total matches exactly.
         """
-        counts: Dict[str, int] = {}
         remaining = batch_size
         names = list(proportions.keys())
         values = list(proportions.values())
-        for index, (name, p) in enumerate(zip(names, values)):
             if index == len(names) - 1:
                 counts[name] = remaining
             else:
@@ -384,7 +361,7 @@ class BatchComposer:
                 remaining -= n
         return counts
-    def _calculate_generator_samples(self, batch_size: int) -> Dict[str, int]:
         """
         Calculate the number of samples each generator should contribute.
@@ -401,7 +378,7 @@ class BatchComposer:
         proportions = list(self.generator_proportions.values())
         # Calculate base samples for each generator
-        for i, (generator, proportion) in enumerate(zip(generators, proportions)):
             if generator not in self.datasets:
                 continue
@@ -417,9 +394,9 @@ class BatchComposer:
     def create_batch(
         self,
         batch_size: int = 128,
-        seed: Optional[int] = None,
-        future_length: Optional[int] = None,
-    ) -> Tuple[BatchTimeSeriesContainer, str]:
         """
         Create a batch of the specified size.
@@ -443,8 +420,8 @@ class BatchComposer:
             return self._create_uniform_batch(batch_size, batch_rng, future_length)
     def _create_mixed_batch(
-        self, batch_size: int, future_length: Optional[int] = None
-    ) -> Tuple[BatchTimeSeriesContainer, str]:
         """Create a mixed batch with samples from multiple generators, rejecting NaNs."""
         # Choose total length for this batch; respect length_shortening flag.
@@ -457,11 +434,7 @@ class BatchComposer:
             total_length_for_batch = int(max(LENGTH_CHOICES))
         if future_length is None:
-            prediction_length = int(
-                sample_future_length(
-                    range="gift_eval", total_length=total_length_for_batch
-                )
-            )
         else:
             prediction_length = future_length
@@ -469,9 +442,7 @@ class BatchComposer:
         # Calculate samples per generator using simple, per-batch length-aware proportions
         effective_props = self._effective_proportions_for_length(total_length_for_batch)
-        generator_samples = self._compute_sample_counts_for_batch(
-            effective_props, batch_size
-        )
         all_values = []
         all_starts = []
@@ -504,9 +475,7 @@ class BatchComposer:
                     if len(generator_values) >= num_samples:
                         break
-                    values, sample_start, sample_freq = self._convert_sample_to_tensors(
-                        sample, future_length
-                    )
                     # Skip if NaNs exist (we inject NaNs later in history only)
                     if torch.isnan(values).any():
@@ -518,9 +487,7 @@ class BatchComposer:
                         if strategy == "cut":
                             max_start_idx = values.shape[1] - total_length_for_batch
                             start_idx = int(self.rng.integers(0, max_start_idx + 1))
-                            values = values[
-                                :, start_idx : start_idx + total_length_for_batch, :
-                            ]
                         else:
                             indices = np.linspace(
                                 0,
@@ -534,9 +501,7 @@ class BatchComposer:
                     if self._should_apply_scaler_augmentation():
                         scaler = self._choose_random_scaler()
                         if scaler is not None:
-                            values = scaler.scale(
-                                values, scaler.compute_statistics(values)
-                            )
                     generator_values.append(values)
                     generator_starts.append(sample_start)
@@ -544,7 +509,8 @@ class BatchComposer:
             if len(generator_values) < num_samples:
                 logger.warning(
-                    f"Generator {generator_name}: collected {len(generator_values)}/{num_samples} after {attempts} attempts"
                 )
             # Add the collected valid samples to the main batch lists
@@ -555,16 +521,12 @@ class BatchComposer:
                 actual_proportions[generator_name] = len(generator_values)
         if not all_values:
-            raise RuntimeError(
-                "No valid samples could be collected from any generator."
-            )
         combined_values = torch.cat(all_values, dim=0)
         # Split into history and future
         combined_history = combined_values[:, :history_length, :]
-        combined_future = combined_values[
-            :, history_length : history_length + prediction_length, :
-        ]
         if self.nan_augmenter is not None:
             combined_history = self.nan_augmenter.transform(combined_history)
@@ -583,8 +545,8 @@ class BatchComposer:
         self,
         batch_size: int,
         batch_rng: np.random.Generator,
-        future_length: Optional[int] = None,
-    ) -> Tuple[BatchTimeSeriesContainer, str]:
         """Create a uniform batch with samples from a single generator."""
         # Select generator based on proportions
@@ -606,9 +568,7 @@ class BatchComposer:
         all_frequencies = []
         for sample in samples:
-            values, sample_start, sample_freq = self._convert_sample_to_tensors(
-                sample, future_length
-            )
             total_length = values.shape[1]
             history_length = max(1, total_length - future_length)
@@ -642,14 +602,14 @@ class BatchComposer:
         return container, selected_generator
-    def get_dataset_info(self) -> Dict[str, dict]:
         """Get information about all datasets."""
         info = {}
         for name, dataset in self.datasets.items():
             info[name] = dataset.get_info()
         return info
-    def get_generator_info(self) -> Dict[str, any]:
         """Get information about the composer configuration."""
         return {
             "mixed_batches": self.mixed_batches,
@@ -702,4 +662,4 @@ class ComposedDataset(torch.utils.data.Dataset):
         batch, _ = self.batch_composer.create_batch(
             batch_size=self.batch_size, seed=self.batch_composer.global_seed + idx
         )
-        return batch

 import json
 import logging
 import random
 import numpy as np
 import pandas as pd
     def __init__(
         self,
         base_data_dir: str,
+        generator_proportions: dict[str, float] | None = None,
         mixed_batches: bool = True,
+        device: torch.device | None = None,
+        augmentations: dict[str, bool] | None = None,
+        augmentation_probabilities: dict[str, float] | None = None,
+        nan_stats_path: str | None = None,
+        nan_patterns_path: str | None = None,
         global_seed: int = 42,
+        chosen_scaler_name: str | None = None,
         rank: int = 0,
         world_size: int = 1,
     ):
             "scaler_augmentation": 0.5,
         }
         # Optional preferred scaler name provided by training config
+        self.chosen_scaler_name = chosen_scaler_name.lower() if chosen_scaler_name is not None else None
         # Setup random state
         self.rng = np.random.default_rng(global_seed)
             f"augmentation_probabilities={self.augmentation_probabilities}"
         )
+    def _setup_augmentations(self, augmentations: dict[str, bool] | None):
         """Setup only the augmentations that should remain online (NaN)."""
         default_augmentations = {
             "nan_augmentation": False,
         self.nan_augmenter = None
         if self.augmentations.get("nan_augmentation", False):
             stats_path_to_use = self.nan_stats_path or DEFAULT_NAN_STATS_PATH
+            stats = json.load(open(stats_path_to_use))
             self.nan_augmenter = NanAugmenter(
                 p_series_has_nan=stats["p_series_has_nan"],
                 nan_ratio_distribution=stats["nan_ratio_distribution"],
         """
         if not self.augmentations.get("scaler_augmentation", False):
             return False
+        probability = float(self.augmentation_probabilities.get("scaler_augmentation", 0.0))
         probability = max(0.0, min(1.0, probability))
         return bool(self.rng.random() < probability)
+    def _choose_random_scaler(self) -> object | None:
         """
         Choose a random scaler for augmentation, explicitly avoiding the one that
         is already selected in the training configuration (if any).
         Returns an instance of the selected scaler or None when no valid option exists.
         """
+        chosen: str | None = None
         if self.chosen_scaler_name is not None:
             chosen = self.chosen_scaler_name.strip().lower()
         total = sum(self.generator_proportions.values())
         if total <= 0:
             raise ValueError("Total generator proportions must be positive")
+        self.generator_proportions = {k: v / total for k, v in self.generator_proportions.items()}
+    def _initialize_datasets(self) -> dict[str, CyclicalBatchDataset]:
         """Initialize CyclicalBatchDataset for each generator with proportion > 0."""
         datasets = {}
                     world_size=self.world_size,
                 )
                 datasets[generator_name] = dataset
+                logger.info(f"Loaded dataset for {generator_name} (proportion = {proportion})")
             except Exception as e:
                 logger.warning(f"Failed to load dataset for {generator_name}: {e}")
                 continue
         if not datasets:
+            raise ValueError(f"No valid datasets found in {self.base_data_dir} or all generators have proportion <= 0")
         return datasets
     def _convert_sample_to_tensors(
+        self, sample: dict, future_length: int | None = None
+    ) -> tuple[torch.Tensor, np.datetime64, Frequency]:
         """
         Convert a sample dict to tensors and metadata.
             if isinstance(values_data[0], list):
                 # New format: [[channel_values]]
                 values = torch.tensor(values_data[0], dtype=torch.float32)
+                logger.debug(f"{generator_type}: Using new univariate format, shape: {values.shape}")
             else:
                 # Old format: [values]
                 values = torch.tensor(values_data, dtype=torch.float32)
             # Stack channels: [1, seq_len, num_channels]
             values = torch.stack(channel_tensors, dim=-1).unsqueeze(0)
+            logger.debug(f"{generator_type}: Using multivariate format, {num_channels} channels, shape: {values.shape}")
         # Handle frequency conversion
         freq_str = sample["frequency"]
         return values, start, frequency
+    def _effective_proportions_for_length(self, total_length_for_batch: int) -> dict[str, float]:
         """
         Build a simple, length-aware proportion map for the current batch.
         - Normalize the final map to sum to 1.
         """
+        def augmented_length_from_name(name: str) -> int | None:
             if not name.startswith("augmented"):
                 return None
             suffix = name[len("augmented") :]
                 return None
         # 1) Adjust proportions with the length-aware rule
+        adjusted: dict[str, float] = {}
         for name, proportion in self.generator_proportions.items():
             aug_len = augmented_length_from_name(name)
             if aug_len is None:
                 adjusted[name] = proportion
             else:
+                adjusted[name] = proportion if aug_len == total_length_for_batch else 0.0
         # 2) Keep only available, positive-weight datasets
+        adjusted = {name: p for name, p in adjusted.items() if name in self.datasets and p > 0.0}
         # 3) Fallback if empty
         if not adjusted:
         total = sum(adjusted.values())
         return {name: p / total for name, p in adjusted.items()}
+    def _compute_sample_counts_for_batch(self, proportions: dict[str, float], batch_size: int) -> dict[str, int]:
         """
         Convert a proportion map into integer sample counts that sum to batch_size.
         Strategy: allocate floor(batch_size * p) to each generator in order, and let the
         last generator absorb any remainder to ensure the total matches exactly.
         """
+        counts: dict[str, int] = {}
         remaining = batch_size
         names = list(proportions.keys())
         values = list(proportions.values())
+        for index, (name, p) in enumerate(zip(names, values, strict=True)):
             if index == len(names) - 1:
                 counts[name] = remaining
             else:
                 remaining -= n
         return counts
+    def _calculate_generator_samples(self, batch_size: int) -> dict[str, int]:
         """
         Calculate the number of samples each generator should contribute.
         proportions = list(self.generator_proportions.values())
         # Calculate base samples for each generator
+        for i, (generator, proportion) in enumerate(zip(generators, proportions, strict=True)):
             if generator not in self.datasets:
                 continue
     def create_batch(
         self,
         batch_size: int = 128,
+        seed: int | None = None,
+        future_length: int | None = None,
+    ) -> tuple[BatchTimeSeriesContainer, str]:
         """
         Create a batch of the specified size.
             return self._create_uniform_batch(batch_size, batch_rng, future_length)
     def _create_mixed_batch(
+        self, batch_size: int, future_length: int | None = None
+    ) -> tuple[BatchTimeSeriesContainer, str]:
         """Create a mixed batch with samples from multiple generators, rejecting NaNs."""
         # Choose total length for this batch; respect length_shortening flag.
             total_length_for_batch = int(max(LENGTH_CHOICES))
         if future_length is None:
+            prediction_length = int(sample_future_length(range="gift_eval", total_length=total_length_for_batch))
         else:
             prediction_length = future_length
         # Calculate samples per generator using simple, per-batch length-aware proportions
         effective_props = self._effective_proportions_for_length(total_length_for_batch)
+        generator_samples = self._compute_sample_counts_for_batch(effective_props, batch_size)
         all_values = []
         all_starts = []
                     if len(generator_values) >= num_samples:
                         break
+                    values, sample_start, sample_freq = self._convert_sample_to_tensors(sample, future_length)
                     # Skip if NaNs exist (we inject NaNs later in history only)
                     if torch.isnan(values).any():
                         if strategy == "cut":
                             max_start_idx = values.shape[1] - total_length_for_batch
                             start_idx = int(self.rng.integers(0, max_start_idx + 1))
+                            values = values[:, start_idx : start_idx + total_length_for_batch, :]
                         else:
                             indices = np.linspace(
                                 0,
                     if self._should_apply_scaler_augmentation():
                         scaler = self._choose_random_scaler()
                         if scaler is not None:
+                            values = scaler.scale(values, scaler.compute_statistics(values))
                     generator_values.append(values)
                     generator_starts.append(sample_start)
             if len(generator_values) < num_samples:
                 logger.warning(
+                    f"Generator {generator_name}: collected {len(generator_values)}/"
+                    f"{num_samples} after {attempts} attempts"
                 )
             # Add the collected valid samples to the main batch lists
                 actual_proportions[generator_name] = len(generator_values)
         if not all_values:
+            raise RuntimeError("No valid samples could be collected from any generator.")
         combined_values = torch.cat(all_values, dim=0)
         # Split into history and future
         combined_history = combined_values[:, :history_length, :]
+        combined_future = combined_values[:, history_length : history_length + prediction_length, :]
         if self.nan_augmenter is not None:
             combined_history = self.nan_augmenter.transform(combined_history)
         self,
         batch_size: int,
         batch_rng: np.random.Generator,
+        future_length: int | None = None,
+    ) -> tuple[BatchTimeSeriesContainer, str]:
         """Create a uniform batch with samples from a single generator."""
         # Select generator based on proportions
         all_frequencies = []
         for sample in samples:
+            values, sample_start, sample_freq = self._convert_sample_to_tensors(sample, future_length)
             total_length = values.shape[1]
             history_length = max(1, total_length - future_length)
         return container, selected_generator
+    def get_dataset_info(self) -> dict[str, dict]:
         """Get information about all datasets."""
         info = {}
         for name, dataset in self.datasets.items():
             info[name] = dataset.get_info()
         return info
+    def get_generator_info(self) -> dict[str, any]:
         """Get information about the composer configuration."""
         return {
             "mixed_batches": self.mixed_batches,
         batch, _ = self.batch_composer.create_batch(
             batch_size=self.batch_size, seed=self.batch_composer.global_seed + idx
         )
+        return batch

src/data/constants.py CHANGED Viewed

@@ -1,5 +1,4 @@
 from datetime import date
-from typing import Dict
 import numpy as np
@@ -15,7 +14,7 @@ LENGTH_CHOICES = [128, 256, 512, 1024, 1536, 2048]
 DEFAULT_NAN_STATS_PATH: str = "./data/nan_stats.json"
-LENGTH_WEIGHTS: Dict[int, float] = {
     128: 0.05,
     256: 0.10,
     512: 0.10,

 from datetime import date
 import numpy as np
 DEFAULT_NAN_STATS_PATH: str = "./data/nan_stats.json"
+LENGTH_WEIGHTS: dict[int, float] = {
     128: 0.05,
     256: 0.10,
     512: 0.10,

src/data/containers.py CHANGED Viewed

@@ -1,5 +1,4 @@
 from dataclasses import dataclass
-from typing import List, Optional
 import numpy as np
 import torch
@@ -29,11 +28,11 @@ class BatchTimeSeriesContainer:
     history_values: torch.Tensor
     future_values: torch.Tensor
-    start: List[np.datetime64]
-    frequency: List[Frequency]
-    history_mask: Optional[torch.Tensor] = None
-    future_mask: Optional[torch.Tensor] = None
     def __post_init__(self):
         """Validate all tensor shapes and consistency."""
@@ -42,13 +41,9 @@ class BatchTimeSeriesContainer:
             raise TypeError("history_values must be a torch.Tensor")
         if not isinstance(self.future_values, torch.Tensor):
             raise TypeError("future_values must be a torch.Tensor")
-        if not isinstance(self.start, list) or not all(
-            isinstance(x, np.datetime64) for x in self.start
-        ):
             raise TypeError("start must be a List[np.datetime64]")
-        if not isinstance(self.frequency, list) or not all(
-            isinstance(x, Frequency) for x in self.frequency
-        ):
             raise TypeError("frequency must be a List[Frequency]")
         batch_size, seq_len, num_channels = self.history_values.shape
@@ -73,16 +68,14 @@ class BatchTimeSeriesContainer:
             if not isinstance(self.future_mask, torch.Tensor):
                 raise TypeError("future_mask must be a Tensor or None")
             if not (
-                self.future_mask.shape == (batch_size, pred_len)
-                or self.future_mask.shape == self.future_values.shape
             ):
                 raise ValueError(
-                    f"Shape mismatch in future_mask: expected {(batch_size, pred_len)} or {self.future_values.shape}, got {self.future_mask.shape}"
                 )
-    def to_device(
-        self, device: torch.device, attributes: Optional[List[str]] = None
-    ) -> None:
         """
         Move specified tensors to the target device in place.
@@ -109,7 +102,7 @@ class BatchTimeSeriesContainer:
             if all_tensors[attr] is not None:
                 setattr(self, attr, all_tensors[attr].to(device))
-    def to(self, device: torch.device, attributes: Optional[List[str]] = None):
         """
         Alias for to_device method for consistency with PyTorch conventions.
@@ -157,39 +150,33 @@ class TimeSeriesContainer:
     """
     values: np.ndarray
-    start: List[np.datetime64]
-    frequency: List[Frequency]
     def __post_init__(self):
         """Validate all shapes and consistency."""
         # --- Numpy Type Checks ---
         if not isinstance(self.values, np.ndarray):
             raise TypeError("values must be a np.ndarray")
-        if not isinstance(self.start, list) or not all(
-            isinstance(x, np.datetime64) for x in self.start
-        ):
             raise TypeError("start must be a List[np.datetime64]")
-        if not isinstance(self.frequency, list) or not all(
-            isinstance(x, Frequency) for x in self.frequency
-        ):
             raise TypeError("frequency must be a List[Frequency]")
         # --- Shape and Length Consistency Checks ---
         if len(self.values.shape) < 2 or len(self.values.shape) > 3:
             raise ValueError(
-                f"values must have 2 or 3 dimensions [batch_size, seq_len] or [batch_size, seq_len, num_channels], got shape {self.values.shape}"
             )
         batch_size = self.values.shape[0]
         if len(self.start) != batch_size:
-            raise ValueError(
-                f"Length of start ({len(self.start)}) must match batch_size ({batch_size})"
-            )
         if len(self.frequency) != batch_size:
-            raise ValueError(
-                f"Length of frequency ({len(self.frequency)}) must match batch_size ({batch_size})"
-            )
     @property
     def batch_size(self) -> int:

 from dataclasses import dataclass
 import numpy as np
 import torch
     history_values: torch.Tensor
     future_values: torch.Tensor
+    start: list[np.datetime64]
+    frequency: list[Frequency]
+    history_mask: torch.Tensor | None = None
+    future_mask: torch.Tensor | None = None
     def __post_init__(self):
         """Validate all tensor shapes and consistency."""
             raise TypeError("history_values must be a torch.Tensor")
         if not isinstance(self.future_values, torch.Tensor):
             raise TypeError("future_values must be a torch.Tensor")
+        if not isinstance(self.start, list) or not all(isinstance(x, np.datetime64) for x in self.start):
             raise TypeError("start must be a List[np.datetime64]")
+        if not isinstance(self.frequency, list) or not all(isinstance(x, Frequency) for x in self.frequency):
             raise TypeError("frequency must be a List[Frequency]")
         batch_size, seq_len, num_channels = self.history_values.shape
             if not isinstance(self.future_mask, torch.Tensor):
                 raise TypeError("future_mask must be a Tensor or None")
             if not (
+                self.future_mask.shape == (batch_size, pred_len) or self.future_mask.shape == self.future_values.shape
             ):
                 raise ValueError(
+                    "Shape mismatch in future_mask: "
+                    f"expected {(batch_size, pred_len)} or {self.future_values.shape}, got {self.future_mask.shape}"
                 )
+    def to_device(self, device: torch.device, attributes: list[str] | None = None) -> None:
         """
         Move specified tensors to the target device in place.
             if all_tensors[attr] is not None:
                 setattr(self, attr, all_tensors[attr].to(device))
+    def to(self, device: torch.device, attributes: list[str] | None = None):
         """
         Alias for to_device method for consistency with PyTorch conventions.
     """
     values: np.ndarray
+    start: list[np.datetime64]
+    frequency: list[Frequency]
     def __post_init__(self):
         """Validate all shapes and consistency."""
         # --- Numpy Type Checks ---
         if not isinstance(self.values, np.ndarray):
             raise TypeError("values must be a np.ndarray")
+        if not isinstance(self.start, list) or not all(isinstance(x, np.datetime64) for x in self.start):
             raise TypeError("start must be a List[np.datetime64]")
+        if not isinstance(self.frequency, list) or not all(isinstance(x, Frequency) for x in self.frequency):
             raise TypeError("frequency must be a List[Frequency]")
         # --- Shape and Length Consistency Checks ---
         if len(self.values.shape) < 2 or len(self.values.shape) > 3:
             raise ValueError(
+                "values must have 2 or 3 dimensions "
+                "[batch_size, seq_len] or [batch_size, seq_len, num_channels], "
+                f"got shape {self.values.shape}"
             )
         batch_size = self.values.shape[0]
         if len(self.start) != batch_size:
+            raise ValueError(f"Length of start ({len(self.start)}) must match batch_size ({batch_size})")
         if len(self.frequency) != batch_size:
+            raise ValueError(f"Length of frequency ({len(self.frequency)}) must match batch_size ({batch_size})")
     @property
     def batch_size(self) -> int:

src/data/datasets.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import logging
 import os
 import random
-from typing import List, Optional
 import pyarrow.feather as feather
 import torch
@@ -21,7 +20,7 @@ class CyclicalBatchDataset:
         self,
         batches_dir: str,
         generator_type: str,
-        device: Optional[torch.device] = None,
         prefetch_next: bool = True,
         prefetch_threshold: int = 32,
         rank: int = 0,
@@ -72,7 +71,7 @@ class CyclicalBatchDataset:
             f"has {len(self.current_batch_data)} samples."
         )
-    def _find_batch_files(self) -> List[str]:
         """
         Find and sort batch files with per-rank sharding for distributed training.
@@ -89,9 +88,7 @@ class CyclicalBatchDataset:
         # Shard files across ranks: each rank gets every world_size-th file
         # Example with 4 ranks: rank0=[0,4,8,...], rank1=[1,5,9,...], etc.
-        rank_files = [
-            f for i, f in enumerate(all_files) if i % self.world_size == self.rank
-        ]
         # Shuffle only within this rank's shard for variety
         random.shuffle(rank_files)
@@ -103,7 +100,7 @@ class CyclicalBatchDataset:
         return rank_files
-    def _load_batch_from_file(self, batch_file: str) -> List[dict]:
         """Load a batch from arrow file."""
         try:
             table = feather.read_table(batch_file)
@@ -163,9 +160,7 @@ class CyclicalBatchDataset:
         next_batch_file = self.batch_files[next_batch_idx]
         try:
             self.next_batch_data = self._load_batch_from_file(next_batch_file)
-            logger.debug(
-                f"Prefetched next batch {next_batch_idx} for {self.generator_type}"
-            )
         except Exception as e:
             logger.warning(f"Failed to prefetch batch {next_batch_idx}: {e}")
             self.next_batch_data = None
@@ -229,7 +224,7 @@ class CyclicalBatchDataset:
         self.current_sample_idx += 1
         return sample
-    def get_samples(self, num_samples: int) -> List[dict]:
         """Get multiple samples."""
         samples = []
         for _ in range(num_samples):
@@ -260,8 +255,6 @@ class CyclicalBatchDataset:
             "current_batch_size": self.get_total_samples_in_current_batch(),
             "remaining_in_batch": self.get_remaining_samples_in_current_batch(),
             "unique_files_visited": visited_count,
-            "cycle_progress_percent": (visited_count / total_files) * 100
-            if total_files > 0
-            else 0,
             "full_cycles_completed": self.full_cycles_completed,
-        }

 import logging
 import os
 import random
 import pyarrow.feather as feather
 import torch
         self,
         batches_dir: str,
         generator_type: str,
+        device: torch.device | None = None,
         prefetch_next: bool = True,
         prefetch_threshold: int = 32,
         rank: int = 0,
             f"has {len(self.current_batch_data)} samples."
         )
+    def _find_batch_files(self) -> list[str]:
         """
         Find and sort batch files with per-rank sharding for distributed training.
         # Shard files across ranks: each rank gets every world_size-th file
         # Example with 4 ranks: rank0=[0,4,8,...], rank1=[1,5,9,...], etc.
+        rank_files = [f for i, f in enumerate(all_files) if i % self.world_size == self.rank]
         # Shuffle only within this rank's shard for variety
         random.shuffle(rank_files)
         return rank_files
+    def _load_batch_from_file(self, batch_file: str) -> list[dict]:
         """Load a batch from arrow file."""
         try:
             table = feather.read_table(batch_file)
         next_batch_file = self.batch_files[next_batch_idx]
         try:
             self.next_batch_data = self._load_batch_from_file(next_batch_file)
+            logger.debug(f"Prefetched next batch {next_batch_idx} for {self.generator_type}")
         except Exception as e:
             logger.warning(f"Failed to prefetch batch {next_batch_idx}: {e}")
             self.next_batch_data = None
         self.current_sample_idx += 1
         return sample
+    def get_samples(self, num_samples: int) -> list[dict]:
         """Get multiple samples."""
         samples = []
         for _ in range(num_samples):
             "current_batch_size": self.get_total_samples_in_current_batch(),
             "remaining_in_batch": self.get_remaining_samples_in_current_batch(),
             "unique_files_visited": visited_count,
+            "cycle_progress_percent": (visited_count / total_files) * 100 if total_files > 0 else 0,
             "full_cycles_completed": self.full_cycles_completed,
+        }

src/data/filter.py CHANGED Viewed

@@ -66,8 +66,6 @@ def is_low_quality(
     complexity_score = lempel_ziv_complexity(binary_seq)
     normalized_complexity = complexity_score / max(1, len(binary_seq))
-    is_random_like = (snr_proxy < snr_threshold) and (
-        normalized_complexity > complexity_threshold
-    )
     is_uncorrelated = autocorr_strength < autocorr_threshold
     return bool(is_uncorrelated and is_random_like)

     complexity_score = lempel_ziv_complexity(binary_seq)
     normalized_complexity = complexity_score / max(1, len(binary_seq))
+    is_random_like = (snr_proxy < snr_threshold) and (normalized_complexity > complexity_threshold)
     is_uncorrelated = autocorr_strength < autocorr_threshold
     return bool(is_uncorrelated and is_random_like)

src/data/frequency.py CHANGED Viewed

@@ -13,7 +13,6 @@ This module centralizes all frequency-related functionality including:
 import logging
 import re
 from enum import Enum
-from typing import Dict, Tuple
 import numpy as np
 import pandas as pd
@@ -132,7 +131,7 @@ class Frequency(Enum):
         """Get GIFT eval dataset frequency weight."""
         return GIFT_EVAL_FREQUENCY_WEIGHTS.get(self, 0.1)
-    def get_length_range(self) -> Tuple[int, int, int, int]:
         """Get (min_length, max_length, optimal_start, optimal_end) for this frequency."""
         return GIFT_EVAL_LENGTH_RANGES.get(self, (50, 1000, 100, 500))
@@ -142,7 +141,7 @@ class Frequency(Enum):
 # ============================================================================
 # Core frequency mapping: (pandas_base, prefix, days_per_period)
-FREQUENCY_MAPPING: Dict[Frequency, Tuple[str, str, float]] = {
     Frequency.A: (
         "YE",
         "",
@@ -162,7 +161,7 @@ FREQUENCY_MAPPING: Dict[Frequency, Tuple[str, str, float]] = {
 }
 # Frequency to pandas offset mapping for calculating time deltas
-FREQUENCY_TO_OFFSET: Dict[Frequency, str] = {
     Frequency.A: "AS",  # Annual start
     Frequency.Q: "QS",  # Quarter start
     Frequency.M: "MS",  # Month start
@@ -203,7 +202,7 @@ ALL_FREQUENCY_MAX_LENGTHS = {
 }
 # GIFT eval-based frequency weights from actual dataset analysis
-GIFT_EVAL_FREQUENCY_WEIGHTS: Dict[Frequency, float] = {
     Frequency.H: 25.0,  # Hourly - most common
     Frequency.D: 23.4,  # Daily - second most common
     Frequency.W: 12.9,  # Weekly - third most common
@@ -219,7 +218,7 @@ GIFT_EVAL_FREQUENCY_WEIGHTS: Dict[Frequency, float] = {
 # GIFT eval-based length ranges derived from actual dataset analysis
 # Format: (min_length, max_length, optimal_start, optimal_end)
-GIFT_EVAL_LENGTH_RANGES: Dict[Frequency, Tuple[int, int, int, int]] = {
     # Low frequency ranges (based on actual GIFT eval data + logical extensions)
     Frequency.A: (25, 100, 30, 70),
     Frequency.Q: (25, 150, 50, 120),
@@ -264,9 +263,7 @@ def parse_frequency(freq_str: str) -> Frequency:
     """
     # Handle minute-based frequencies BEFORE pandas standardization
     # because pandas converts "5T" to just "min", losing the multiplier
-    minute_match = re.match(r"^(\d*)T$", freq_str, re.IGNORECASE) or re.match(
-        r"^(\d*)min$", freq_str, re.IGNORECASE
-    )
     if minute_match:
         multiplier = int(minute_match.group(1)) if minute_match.group(1) else 1
         enum_key = f"T{multiplier}"
@@ -309,9 +306,7 @@ def parse_frequency(freq_str: str) -> Frequency:
     raise NotImplementedError(f"Frequency '{standardized_freq}' is not supported.")
-def validate_frequency_safety(
-    start_date: np.datetime64, total_length: int, frequency: Frequency
-) -> bool:
     """
     Check if start date and frequency combination is safe for pandas datetime operations.
@@ -427,9 +422,7 @@ def select_safe_random_frequency(total_length: int, rng: Generator) -> Frequency
             # Outside optimal but within valid range - calculate penalty
             if total_length < optimal_start:
                 # Below optimal range
-                distance_ratio = (optimal_start - total_length) / (
-                    optimal_start - min_len
-                )
             else:
                 # Above optimal range
                 distance_ratio = (total_length - optimal_end) / (max_len - optimal_end)
@@ -479,7 +472,7 @@ def select_safe_random_frequency(total_length: int, rng: Generator) -> Frequency
 def select_safe_start_date(
     total_length: int,
     frequency: Frequency,
-    rng: Generator = np.random.default_rng(),
     max_retries: int = 10,
 ) -> np.datetime64:
     """
@@ -499,6 +492,9 @@ def select_safe_start_date(
         ValueError: If no safe start date is found after max_retries or if the required
                    time span exceeds the available date window
     """
     days_per_period = frequency.get_days_per_period()
     # Calculate approximate duration in days
@@ -510,9 +506,7 @@ def select_safe_start_date(
     # Check if the required time span exceeds the available window
     if latest_safe_start < earliest_safe_start:
-        available_days = (
-            (BASE_END_DATE - BASE_START_DATE).astype("timedelta64[D]").astype(int)
-        )
         available_years = available_days / 365.25
         required_years = total_days / 365.25
         raise ValueError(

 import logging
 import re
 from enum import Enum
 import numpy as np
 import pandas as pd
         """Get GIFT eval dataset frequency weight."""
         return GIFT_EVAL_FREQUENCY_WEIGHTS.get(self, 0.1)
+    def get_length_range(self) -> tuple[int, int, int, int]:
         """Get (min_length, max_length, optimal_start, optimal_end) for this frequency."""
         return GIFT_EVAL_LENGTH_RANGES.get(self, (50, 1000, 100, 500))
 # ============================================================================
 # Core frequency mapping: (pandas_base, prefix, days_per_period)
+FREQUENCY_MAPPING: dict[Frequency, tuple[str, str, float]] = {
     Frequency.A: (
         "YE",
         "",
 }
 # Frequency to pandas offset mapping for calculating time deltas
+FREQUENCY_TO_OFFSET: dict[Frequency, str] = {
     Frequency.A: "AS",  # Annual start
     Frequency.Q: "QS",  # Quarter start
     Frequency.M: "MS",  # Month start
 }
 # GIFT eval-based frequency weights from actual dataset analysis
+GIFT_EVAL_FREQUENCY_WEIGHTS: dict[Frequency, float] = {
     Frequency.H: 25.0,  # Hourly - most common
     Frequency.D: 23.4,  # Daily - second most common
     Frequency.W: 12.9,  # Weekly - third most common
 # GIFT eval-based length ranges derived from actual dataset analysis
 # Format: (min_length, max_length, optimal_start, optimal_end)
+GIFT_EVAL_LENGTH_RANGES: dict[Frequency, tuple[int, int, int, int]] = {
     # Low frequency ranges (based on actual GIFT eval data + logical extensions)
     Frequency.A: (25, 100, 30, 70),
     Frequency.Q: (25, 150, 50, 120),
     """
     # Handle minute-based frequencies BEFORE pandas standardization
     # because pandas converts "5T" to just "min", losing the multiplier
+    minute_match = re.match(r"^(\d*)T$", freq_str, re.IGNORECASE) or re.match(r"^(\d*)min$", freq_str, re.IGNORECASE)
     if minute_match:
         multiplier = int(minute_match.group(1)) if minute_match.group(1) else 1
         enum_key = f"T{multiplier}"
     raise NotImplementedError(f"Frequency '{standardized_freq}' is not supported.")
+def validate_frequency_safety(start_date: np.datetime64, total_length: int, frequency: Frequency) -> bool:
     """
     Check if start date and frequency combination is safe for pandas datetime operations.
             # Outside optimal but within valid range - calculate penalty
             if total_length < optimal_start:
                 # Below optimal range
+                distance_ratio = (optimal_start - total_length) / (optimal_start - min_len)
             else:
                 # Above optimal range
                 distance_ratio = (total_length - optimal_end) / (max_len - optimal_end)
 def select_safe_start_date(
     total_length: int,
     frequency: Frequency,
+    rng: Generator | None = None,
     max_retries: int = 10,
 ) -> np.datetime64:
     """
         ValueError: If no safe start date is found after max_retries or if the required
                    time span exceeds the available date window
     """
+    if rng is None:
+        rng = np.random.default_rng()
     days_per_period = frequency.get_days_per_period()
     # Calculate approximate duration in days
     # Check if the required time span exceeds the available window
     if latest_safe_start < earliest_safe_start:
+        available_days = (BASE_END_DATE - BASE_START_DATE).astype("timedelta64[D]").astype(int)
         available_years = available_days / 365.25
         required_years = total_days / 365.25
         raise ValueError(

src/data/loaders.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
 import random
-from typing import Dict, Iterator, List, Optional
 import numpy as np
 import pandas as pd
@@ -27,14 +27,14 @@ class GiftEvalDataLoader:
         self,
         mode: str = "train",
         batch_size: int = 32,
-        device: Optional[torch.device] = None,
         shuffle: bool = True,
         to_univariate: bool = False,
-        max_context_length: Optional[int] = None,
         max_windows: int = 20,
         skip_datasets_with_nans: bool = False,
-        datasets_to_use: Optional[List[str]] = None,
-        dataset_storage_path: Optional[str] = None,
     ):
         """
         Initialize GIFT-eval data loader.
@@ -59,9 +59,7 @@ class GiftEvalDataLoader:
                 logger.warning(f"Invalid datasets requested: {invalid_datasets}")
                 logger.warning(f"Available datasets: {ALL_DATASETS}")
                 # Use only valid datasets
-                self.dataset_names = [
-                    ds for ds in datasets_to_use if ds in ALL_DATASETS
-                ]
             else:
                 self.dataset_names = datasets_to_use
         else:
@@ -69,14 +67,10 @@ class GiftEvalDataLoader:
         # Log dataset selection
         if datasets_to_use is not None and len(datasets_to_use) > 0:
-            logger.info(
-                f"Using subset of datasets: {len(self.dataset_names)}/{len(ALL_DATASETS)} datasets"
-            )
             logger.info(f"Selected datasets: {self.dataset_names}")
         else:
-            logger.info(
-                f"Using all available datasets: {len(self.dataset_names)} datasets"
-            )
         self.terms = self.TERMS
         self.mode = mode
@@ -135,9 +129,7 @@ class GiftEvalDataLoader:
                     )
                     self.datasets[dataset_key] = dataset
-                    self.dataset_prediction_lengths[dataset_key] = (
-                        dataset.prediction_length
-                    )
                     logger.info(
                         f"Loaded {dataset_key} - prediction_length: {dataset.prediction_length}, "
@@ -160,13 +152,11 @@ class GiftEvalDataLoader:
             target_np = np.asarray(target, dtype=np.float32)
             return np.isnan(target_np).any()
         except Exception:
-            logger.warning(
-                "NaN check: failed to coerce target to float32; skipping entry"
-            )
             return True
     def _convert_to_container(
-        self, data_entries: List[dict], prediction_length: int, dataset_freq: str
     ) -> BatchTimeSeriesContainer:
         """Convert a batch of data entries to BatchTimeSeriesContainer format with fixed future length."""
         batch_size = len(data_entries)
@@ -181,18 +171,12 @@ class GiftEvalDataLoader:
             _, seq_len = target.shape
             # Only consider up to the last (max_context_length) values
-            effective_max_context = (
-                self.max_context_length
-                if self.max_context_length is not None
-                else seq_len
-            )
             if seq_len > effective_max_context:
                 seq_len = effective_max_context
             # History is up to (max_context_length - prediction_length)
-            history_len = max(
-                0, min(seq_len, effective_max_context) - prediction_length
-            )
             max_history_len = max(max_history_len, history_len)
         # Get number of channels from first entry
@@ -203,12 +187,8 @@ class GiftEvalDataLoader:
         num_channels = first_target.shape[0]
         # Allocate arrays
-        history_values = np.full(
-            (batch_size, max_history_len, num_channels), np.nan, dtype=np.float32
-        )
-        future_values = np.full(
-            (batch_size, prediction_length, num_channels), np.nan, dtype=np.float32
-        )
         history_mask = np.zeros((batch_size, max_history_len), dtype=bool)
         # Second pass: fill arrays
@@ -219,26 +199,18 @@ class GiftEvalDataLoader:
             # Truncate to last effective_max_context points if needed
             full_seq_len = target.shape[1]
-            total_len_allowed = (
-                self.max_context_length
-                if self.max_context_length is not None
-                else full_seq_len
-            )
             total_len_for_entry = min(full_seq_len, total_len_allowed)
             if total_len_for_entry < prediction_length + 1:
                 # Not enough length to build (history + future). Signal to caller.
-                raise ValueError(
-                    "Entry too short after max_context_length truncation to form history+future window"
-                )
             truncated = target[:, -total_len_for_entry:]
             cur_history_len = total_len_for_entry - prediction_length
             hist = truncated[:, :cur_history_len]  # [C, H]
-            fut = truncated[
-                :, cur_history_len : cur_history_len + prediction_length
-            ]  # [C, P]
             # Write into batch arrays with time last -> transpose to [H, C] / [P, C]
             history_values[i, :cur_history_len, :] = hist.T
@@ -263,9 +235,7 @@ class GiftEvalDataLoader:
             future_values=torch.tensor(future_values, dtype=torch.float32),
             start=start_list,
             frequency=frequency_list,
-            history_mask=torch.tensor(history_mask, dtype=torch.bool)
-            if self.mode == "train"
-            else None,
         )
     def _prepare_epoch_data(self) -> None:
@@ -311,14 +281,10 @@ class GiftEvalDataLoader:
                 for i in range(0, len(valid_entries), self.batch_size):
                     batch_entries = valid_entries[i : i + self.batch_size]
                     try:
-                        batch_container = self._convert_to_container(
-                            batch_entries, prediction_length, dataset_freq
-                        )
                         self._epoch_data.append((dataset_key, batch_container))
                     except Exception as e:
-                        logger.warning(
-                            f"Failed to create batch for {dataset_key}: {str(e)}"
-                        )
                         continue
             except Exception as e:
@@ -419,17 +385,17 @@ def create_synthetic_dataloader(
     base_data_dir: str,
     batch_size: int = 128,
     num_batches_per_epoch: int = 1000,
-    generator_proportions: Optional[Dict[str, float]] = None,
     mixed_batches: bool = True,
-    augmentations: Optional[Dict[str, bool]] = None,
-    augmentation_probabilities: Optional[Dict[str, float]] = None,
-    device: Optional[torch.device] = None,
     num_workers: int = 0,
     pin_memory: bool = True,
     global_seed: int = 42,
-    nan_stats_path: Optional[str] = None,
-    nan_patterns_path: Optional[str] = None,
-    chosen_scaler_name: Optional[str] = None,
 ) -> torch.utils.data.DataLoader:
     """
     Create a PyTorch DataLoader for training with saved generator batches.
@@ -512,14 +478,14 @@ class SyntheticValidationDataset(torch.utils.data.Dataset):
         batch_size: int = 128,
         num_batches: int = 2,
         future_length: int = 512,
-        generator_proportions: Optional[Dict[str, float]] = None,
-        augmentations: Optional[Dict[str, bool]] = None,
-        augmentation_probabilities: Optional[Dict[str, float]] = None,
-        device: Optional[torch.device] = None,
         global_seed: int = 42,
-        chosen_scaler_name: Optional[str] = None,
-        nan_stats_path: Optional[str] = None,
-        nan_patterns_path: Optional[str] = None,
         rank: int = 0,
         world_size: int = 1,
     ):
@@ -564,15 +530,11 @@ class SyntheticValidationDataset(torch.utils.data.Dataset):
             batch, _ = self.batch_composer.create_batch(
                 batch_size=batch_size,
                 future_length=future_length,
-                seed=global_seed
-                + 999999
-                + i,  # Fixed seeds for reproducible validation
             )
             self.validation_batches.append(batch)
-        logger.info(
-            f"Created {num_batches} fixed validation batches with batch_size={batch_size}"
-        )
     def __len__(self) -> int:
         return self.num_batches
@@ -603,14 +565,14 @@ def create_synthetic_dataset(
     base_data_dir: str,
     batch_size: int = 128,
     num_batches_per_epoch: int = 1000,
-    generator_proportions: Optional[Dict[str, float]] = None,
     mixed_batches: bool = True,
-    augmentations: Optional[Dict[str, bool]] = None,
-    augmentation_probabilities: Optional[Dict[str, float]] = None,
     global_seed: int = 42,
-    nan_stats_path: Optional[str] = None,
-    nan_patterns_path: Optional[str] = None,
-    chosen_scaler_name: Optional[str] = None,
     rank: int = 0,
     world_size: int = 1,
 ) -> ComposedDataset:
@@ -658,4 +620,4 @@ def create_synthetic_dataset(
         f"batch_size={batch_size}, mixed_batches={mixed_batches}"
     )
-    return dataset

 import logging
 import random
+from collections.abc import Iterator
 import numpy as np
 import pandas as pd
         self,
         mode: str = "train",
         batch_size: int = 32,
+        device: torch.device | None = None,
         shuffle: bool = True,
         to_univariate: bool = False,
+        max_context_length: int | None = None,
         max_windows: int = 20,
         skip_datasets_with_nans: bool = False,
+        datasets_to_use: list[str] | None = None,
+        dataset_storage_path: str | None = None,
     ):
         """
         Initialize GIFT-eval data loader.
                 logger.warning(f"Invalid datasets requested: {invalid_datasets}")
                 logger.warning(f"Available datasets: {ALL_DATASETS}")
                 # Use only valid datasets
+                self.dataset_names = [ds for ds in datasets_to_use if ds in ALL_DATASETS]
             else:
                 self.dataset_names = datasets_to_use
         else:
         # Log dataset selection
         if datasets_to_use is not None and len(datasets_to_use) > 0:
+            logger.info(f"Using subset of datasets: {len(self.dataset_names)}/{len(ALL_DATASETS)} datasets")
             logger.info(f"Selected datasets: {self.dataset_names}")
         else:
+            logger.info(f"Using all available datasets: {len(self.dataset_names)} datasets")
         self.terms = self.TERMS
         self.mode = mode
                     )
                     self.datasets[dataset_key] = dataset
+                    self.dataset_prediction_lengths[dataset_key] = dataset.prediction_length
                     logger.info(
                         f"Loaded {dataset_key} - prediction_length: {dataset.prediction_length}, "
             target_np = np.asarray(target, dtype=np.float32)
             return np.isnan(target_np).any()
         except Exception:
+            logger.warning("NaN check: failed to coerce target to float32; skipping entry")
             return True
     def _convert_to_container(
+        self, data_entries: list[dict], prediction_length: int, dataset_freq: str
     ) -> BatchTimeSeriesContainer:
         """Convert a batch of data entries to BatchTimeSeriesContainer format with fixed future length."""
         batch_size = len(data_entries)
             _, seq_len = target.shape
             # Only consider up to the last (max_context_length) values
+            effective_max_context = self.max_context_length if self.max_context_length is not None else seq_len
             if seq_len > effective_max_context:
                 seq_len = effective_max_context
             # History is up to (max_context_length - prediction_length)
+            history_len = max(0, min(seq_len, effective_max_context) - prediction_length)
             max_history_len = max(max_history_len, history_len)
         # Get number of channels from first entry
         num_channels = first_target.shape[0]
         # Allocate arrays
+        history_values = np.full((batch_size, max_history_len, num_channels), np.nan, dtype=np.float32)
+        future_values = np.full((batch_size, prediction_length, num_channels), np.nan, dtype=np.float32)
         history_mask = np.zeros((batch_size, max_history_len), dtype=bool)
         # Second pass: fill arrays
             # Truncate to last effective_max_context points if needed
             full_seq_len = target.shape[1]
+            total_len_allowed = self.max_context_length if self.max_context_length is not None else full_seq_len
             total_len_for_entry = min(full_seq_len, total_len_allowed)
             if total_len_for_entry < prediction_length + 1:
                 # Not enough length to build (history + future). Signal to caller.
+                raise ValueError("Entry too short after max_context_length truncation to form history+future window")
             truncated = target[:, -total_len_for_entry:]
             cur_history_len = total_len_for_entry - prediction_length
             hist = truncated[:, :cur_history_len]  # [C, H]
+            fut = truncated[:, cur_history_len : cur_history_len + prediction_length]  # [C, P]
             # Write into batch arrays with time last -> transpose to [H, C] / [P, C]
             history_values[i, :cur_history_len, :] = hist.T
             future_values=torch.tensor(future_values, dtype=torch.float32),
             start=start_list,
             frequency=frequency_list,
+            history_mask=torch.tensor(history_mask, dtype=torch.bool) if self.mode == "train" else None,
         )
     def _prepare_epoch_data(self) -> None:
                 for i in range(0, len(valid_entries), self.batch_size):
                     batch_entries = valid_entries[i : i + self.batch_size]
                     try:
+                        batch_container = self._convert_to_container(batch_entries, prediction_length, dataset_freq)
                         self._epoch_data.append((dataset_key, batch_container))
                     except Exception as e:
+                        logger.warning(f"Failed to create batch for {dataset_key}: {str(e)}")
                         continue
             except Exception as e:
     base_data_dir: str,
     batch_size: int = 128,
     num_batches_per_epoch: int = 1000,
+    generator_proportions: dict[str, float] | None = None,
     mixed_batches: bool = True,
+    augmentations: dict[str, bool] | None = None,
+    augmentation_probabilities: dict[str, float] | None = None,
+    device: torch.device | None = None,
     num_workers: int = 0,
     pin_memory: bool = True,
     global_seed: int = 42,
+    nan_stats_path: str | None = None,
+    nan_patterns_path: str | None = None,
+    chosen_scaler_name: str | None = None,
 ) -> torch.utils.data.DataLoader:
     """
     Create a PyTorch DataLoader for training with saved generator batches.
         batch_size: int = 128,
         num_batches: int = 2,
         future_length: int = 512,
+        generator_proportions: dict[str, float] | None = None,
+        augmentations: dict[str, bool] | None = None,
+        augmentation_probabilities: dict[str, float] | None = None,
+        device: torch.device | None = None,
         global_seed: int = 42,
+        chosen_scaler_name: str | None = None,
+        nan_stats_path: str | None = None,
+        nan_patterns_path: str | None = None,
         rank: int = 0,
         world_size: int = 1,
     ):
             batch, _ = self.batch_composer.create_batch(
                 batch_size=batch_size,
                 future_length=future_length,
+                seed=global_seed + 999999 + i,  # Fixed seeds for reproducible validation
             )
             self.validation_batches.append(batch)
+        logger.info(f"Created {num_batches} fixed validation batches with batch_size={batch_size}")
     def __len__(self) -> int:
         return self.num_batches
     base_data_dir: str,
     batch_size: int = 128,
     num_batches_per_epoch: int = 1000,
+    generator_proportions: dict[str, float] | None = None,
     mixed_batches: bool = True,
+    augmentations: dict[str, bool] | None = None,
+    augmentation_probabilities: dict[str, float] | None = None,
     global_seed: int = 42,
+    nan_stats_path: str | None = None,
+    nan_patterns_path: str | None = None,
+    chosen_scaler_name: str | None = None,
     rank: int = 0,
     world_size: int = 1,
 ) -> ComposedDataset:
         f"batch_size={batch_size}, mixed_batches={mixed_batches}"
     )
+    return dataset

src/data/scalers.py CHANGED Viewed

@@ -1,5 +1,4 @@
 from abc import ABC, abstractmethod
-from typing import Dict, Optional
 import torch
@@ -14,26 +13,22 @@ class BaseScaler(ABC):
     @abstractmethod
     def compute_statistics(
-        self, history_values: torch.Tensor, history_mask: Optional[torch.Tensor] = None
-    ) -> Dict[str, torch.Tensor]:
         """
         Compute scaling statistics from historical data.
         """
         pass
     @abstractmethod
-    def scale(
-        self, data: torch.Tensor, statistics: Dict[str, torch.Tensor]
-    ) -> torch.Tensor:
         """
         Apply scaling transformation to data.
         """
         pass
     @abstractmethod
-    def inverse_scale(
-        self, scaled_data: torch.Tensor, statistics: Dict[str, torch.Tensor]
-    ) -> torch.Tensor:
         """
         Apply inverse scaling transformation to recover original scale.
         """
@@ -54,8 +49,8 @@ class RobustScaler(BaseScaler):
         self.min_scale = min_scale
     def compute_statistics(
-        self, history_values: torch.Tensor, history_mask: Optional[torch.Tensor] = None
-    ) -> Dict[str, torch.Tensor]:
         """
         Compute median and IQR statistics from historical data with improved numerical stability.
         """
@@ -91,49 +86,37 @@ class RobustScaler(BaseScaler):
                         q75 = torch.quantile(valid_data, 0.75)
                         q25 = torch.quantile(valid_data, 0.25)
                         iqr_val = q75 - q25
-                        iqr_val = torch.max(
-                            iqr_val, torch.tensor(self.min_scale, device=device)
-                        )
                         iqrs[b, 0, c] = iqr_val
                     except Exception:
                         std_val = torch.std(valid_data)
-                        iqrs[b, 0, c] = torch.max(
-                            std_val, torch.tensor(self.min_scale, device=device)
-                        )
                 else:
                     iqrs[b, 0, c] = self.min_scale
         return {"median": medians, "iqr": iqrs}
-    def scale(
-        self, data: torch.Tensor, statistics: Dict[str, torch.Tensor]
-    ) -> torch.Tensor:
         """
         Apply robust scaling: (data - median) / (iqr + epsilon).
         """
         median = statistics["median"]
         iqr = statistics["iqr"]
-        denominator = torch.max(
-            iqr + self.epsilon, torch.tensor(self.min_scale, device=iqr.device)
-        )
         scaled_data = (data - median) / denominator
         scaled_data = torch.clamp(scaled_data, -50.0, 50.0)
         return scaled_data
-    def inverse_scale(
-        self, scaled_data: torch.Tensor, statistics: Dict[str, torch.Tensor]
-    ) -> torch.Tensor:
         """
         Apply inverse robust scaling, now compatible with 3D or 4D tensors.
         """
         median = statistics["median"]
         iqr = statistics["iqr"]
-        denominator = torch.max(
-            iqr + self.epsilon, torch.tensor(self.min_scale, device=iqr.device)
-        )
         if scaled_data.ndim == 4:
             denominator = denominator.unsqueeze(-1)
@@ -153,8 +136,8 @@ class MinMaxScaler(BaseScaler):
         self.epsilon = epsilon
     def compute_statistics(
-        self, history_values: torch.Tensor, history_mask: Optional[torch.Tensor] = None
-    ) -> Dict[str, torch.Tensor]:
         """
         Compute min and max statistics from historical data.
         """
@@ -188,9 +171,7 @@ class MinMaxScaler(BaseScaler):
         return {"min": mins, "max": maxs}
-    def scale(
-        self, data: torch.Tensor, statistics: Dict[str, torch.Tensor]
-    ) -> torch.Tensor:
         """
         Apply min-max scaling to range [-1, 1].
         """
@@ -200,9 +181,7 @@ class MinMaxScaler(BaseScaler):
         normalized = (data - min_val) / (max_val - min_val + self.epsilon)
         return normalized * 2.0 - 1.0
-    def inverse_scale(
-        self, scaled_data: torch.Tensor, statistics: Dict[str, torch.Tensor]
-    ) -> torch.Tensor:
         """
         Apply inverse min-max scaling, now compatible with 3D or 4D tensors.
         """
@@ -225,8 +204,8 @@ class MeanScaler(BaseScaler):
     """
     def compute_statistics(
-            self, history_values: torch.Tensor, history_mask: Optional[torch.Tensor] = None
-    ) -> Dict[str, torch.Tensor]:
         """
         Compute the mean for each channel from historical data.
         """
@@ -262,18 +241,14 @@ class MeanScaler(BaseScaler):
         return {"mean": means}
-    def scale(
-            self, data: torch.Tensor, statistics: Dict[str, torch.Tensor]
-    ) -> torch.Tensor:
         """
         Apply mean centering: data - mean.
         """
         mean = statistics["mean"]
         return data - mean
-    def inverse_scale(
-            self, scaled_data: torch.Tensor, statistics: Dict[str, torch.Tensor]
-    ) -> torch.Tensor:
         """
         Apply inverse mean centering: scaled_data + mean.
@@ -297,8 +272,8 @@ class MedianScaler(BaseScaler):
     """
     def compute_statistics(
-            self, history_values: torch.Tensor, history_mask: Optional[torch.Tensor] = None
-    ) -> Dict[str, torch.Tensor]:
         """
         Compute the median for each channel from historical data.
         """
@@ -334,18 +309,14 @@ class MedianScaler(BaseScaler):
         return {"median": medians}
-    def scale(
-            self, data: torch.Tensor, statistics: Dict[str, torch.Tensor]
-    ) -> torch.Tensor:
         """
         Apply median centering: data - median.
         """
         median = statistics["median"]
         return data - median
-    def inverse_scale(
-            self, scaled_data: torch.Tensor, statistics: Dict[str, torch.Tensor]
-    ) -> torch.Tensor:
         """
         Apply inverse median centering: scaled_data + median.

 from abc import ABC, abstractmethod
 import torch
     @abstractmethod
     def compute_statistics(
+        self, history_values: torch.Tensor, history_mask: torch.Tensor | None = None
+    ) -> dict[str, torch.Tensor]:
         """
         Compute scaling statistics from historical data.
         """
         pass
     @abstractmethod
+    def scale(self, data: torch.Tensor, statistics: dict[str, torch.Tensor]) -> torch.Tensor:
         """
         Apply scaling transformation to data.
         """
         pass
     @abstractmethod
+    def inverse_scale(self, scaled_data: torch.Tensor, statistics: dict[str, torch.Tensor]) -> torch.Tensor:
         """
         Apply inverse scaling transformation to recover original scale.
         """
         self.min_scale = min_scale
     def compute_statistics(
+        self, history_values: torch.Tensor, history_mask: torch.Tensor | None = None
+    ) -> dict[str, torch.Tensor]:
         """
         Compute median and IQR statistics from historical data with improved numerical stability.
         """
                         q75 = torch.quantile(valid_data, 0.75)
                         q25 = torch.quantile(valid_data, 0.25)
                         iqr_val = q75 - q25
+                        iqr_val = torch.max(iqr_val, torch.tensor(self.min_scale, device=device))
                         iqrs[b, 0, c] = iqr_val
                     except Exception:
                         std_val = torch.std(valid_data)
+                        iqrs[b, 0, c] = torch.max(std_val, torch.tensor(self.min_scale, device=device))
                 else:
                     iqrs[b, 0, c] = self.min_scale
         return {"median": medians, "iqr": iqrs}
+    def scale(self, data: torch.Tensor, statistics: dict[str, torch.Tensor]) -> torch.Tensor:
         """
         Apply robust scaling: (data - median) / (iqr + epsilon).
         """
         median = statistics["median"]
         iqr = statistics["iqr"]
+        denominator = torch.max(iqr + self.epsilon, torch.tensor(self.min_scale, device=iqr.device))
         scaled_data = (data - median) / denominator
         scaled_data = torch.clamp(scaled_data, -50.0, 50.0)
         return scaled_data
+    def inverse_scale(self, scaled_data: torch.Tensor, statistics: dict[str, torch.Tensor]) -> torch.Tensor:
         """
         Apply inverse robust scaling, now compatible with 3D or 4D tensors.
         """
         median = statistics["median"]
         iqr = statistics["iqr"]
+        denominator = torch.max(iqr + self.epsilon, torch.tensor(self.min_scale, device=iqr.device))
         if scaled_data.ndim == 4:
             denominator = denominator.unsqueeze(-1)
         self.epsilon = epsilon
     def compute_statistics(
+        self, history_values: torch.Tensor, history_mask: torch.Tensor | None = None
+    ) -> dict[str, torch.Tensor]:
         """
         Compute min and max statistics from historical data.
         """
         return {"min": mins, "max": maxs}
+    def scale(self, data: torch.Tensor, statistics: dict[str, torch.Tensor]) -> torch.Tensor:
         """
         Apply min-max scaling to range [-1, 1].
         """
         normalized = (data - min_val) / (max_val - min_val + self.epsilon)
         return normalized * 2.0 - 1.0
+    def inverse_scale(self, scaled_data: torch.Tensor, statistics: dict[str, torch.Tensor]) -> torch.Tensor:
         """
         Apply inverse min-max scaling, now compatible with 3D or 4D tensors.
         """
     """
     def compute_statistics(
+        self, history_values: torch.Tensor, history_mask: torch.Tensor | None = None
+    ) -> dict[str, torch.Tensor]:
         """
         Compute the mean for each channel from historical data.
         """
         return {"mean": means}
+    def scale(self, data: torch.Tensor, statistics: dict[str, torch.Tensor]) -> torch.Tensor:
         """
         Apply mean centering: data - mean.
         """
         mean = statistics["mean"]
         return data - mean
+    def inverse_scale(self, scaled_data: torch.Tensor, statistics: dict[str, torch.Tensor]) -> torch.Tensor:
         """
         Apply inverse mean centering: scaled_data + mean.
     """
     def compute_statistics(
+        self, history_values: torch.Tensor, history_mask: torch.Tensor | None = None
+    ) -> dict[str, torch.Tensor]:
         """
         Compute the median for each channel from historical data.
         """
         return {"median": medians}
+    def scale(self, data: torch.Tensor, statistics: dict[str, torch.Tensor]) -> torch.Tensor:
         """
         Apply median centering: data - median.
         """
         median = statistics["median"]
         return data - median
+    def inverse_scale(self, scaled_data: torch.Tensor, statistics: dict[str, torch.Tensor]) -> torch.Tensor:
         """
         Apply inverse median centering: scaled_data + median.

src/data/time_features.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Dict, List, Optional
 import numpy as np
 import pandas as pd
@@ -52,9 +52,7 @@ from src.data.frequency import (
 from src.utils.utils import device
 # Configure logging
-logging.basicConfig(
-    level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
-)
 logger = logging.getLogger(__name__)
@@ -193,9 +191,7 @@ class TimeFeatureGenerator:
         self.holiday_feature_set = None
         if use_holiday_features and holiday_set in HOLIDAY_FEATURE_SETS:
             kernel_func = self._get_holiday_kernel(holiday_kernel, holiday_kernel_alpha)
-            self.holiday_feature_set = SpecialDateFeatureSet(
-                HOLIDAY_FEATURE_SETS[holiday_set], kernel_func
-            )
     def _get_holiday_kernel(self, kernel_type: str, alpha: float):
         """Get holiday kernel function."""
@@ -216,9 +212,7 @@ class TimeFeatureGenerator:
         else:
             return "low_freq"
-    def _compute_enhanced_features(
-        self, period_index: pd.PeriodIndex, freq_str: str
-    ) -> np.ndarray:
         """Compute enhanced time features based on frequency."""
         if not self.use_enhanced_features:
             return np.array([]).reshape(len(period_index), 0)
@@ -318,9 +312,7 @@ class TimeFeatureGenerator:
                 return []
             # Sort by magnitude and take top periods
-            sorted_indices = peak_indices[
-                np.argsort(fft_magnitudes[peak_indices])[::-1]
-            ]
             top_indices = sorted_indices[: self.max_seasonal_periods]
             # Convert frequencies to periods
@@ -410,9 +402,7 @@ class TimeFeatureGenerator:
         try:
             standard_features = time_features_from_frequency_str(freq_str)
             if standard_features:
-                std_feat = np.stack(
-                    [feat(period_index) for feat in standard_features], axis=-1
-                )
                 all_features.append(std_feat)
         except Exception:
             pass
@@ -428,9 +418,7 @@ class TimeFeatureGenerator:
             all_features.append(holiday_feat)
         # Seasonality features (including auto-detected)
-        seasonality_feat = self._compute_seasonality_features(
-            period_index, freq_str, time_series_values
-        )
         if seasonality_feat.shape[1] > 0:
             all_features.append(seasonality_feat)
@@ -443,13 +431,13 @@ class TimeFeatureGenerator:
 def compute_batch_time_features(
-    start: List[np.datetime64],
     history_length: int,
     future_length: int,
     batch_size: int,
-    frequency: List[Frequency],
     K_max: int = 6,
-    time_feature_config: Optional[Dict[str, Any]] = None,
 ):
     """
     Compute time features from start timestamps and frequency.
@@ -500,37 +488,25 @@ def compute_batch_time_features(
             start_ts = BASE_START_DATE
         # Create history range with bounds checking
-        history_range = pd.date_range(
-            start=start_ts, periods=history_length, freq=freq_str
-        )
         # Check if history range goes beyond safe bounds
         if history_range[-1] > BASE_END_DATE:
-            safe_start = BASE_END_DATE - pd.tseries.frequencies.to_offset(freq_str) * (
-                history_length + future_length
-            )
             if safe_start < BASE_START_DATE:
                 safe_start = BASE_START_DATE
-            history_range = pd.date_range(
-                start=safe_start, periods=history_length, freq=freq_str
-            )
         future_start = history_range[-1] + pd.tseries.frequencies.to_offset(freq_str)
-        future_range = pd.date_range(
-            start=future_start, periods=future_length, freq=freq_str
-        )
         # Convert to period indices
         history_period_idx = history_range.to_period(period_freq_str)
         future_period_idx = future_range.to_period(period_freq_str)
         # Compute enhanced features
-        history_features = feature_generator.compute_features(
-            history_period_idx, history_range, freq_str
-        )
-        future_features = feature_generator.compute_features(
-            future_period_idx, future_range, freq_str
-        )
         # Pad or truncate to K_max
         history_features = _pad_or_truncate_features(history_features, K_max)

 import logging
+from typing import Any
 import numpy as np
 import pandas as pd
 from src.utils.utils import device
 # Configure logging
+logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
         self.holiday_feature_set = None
         if use_holiday_features and holiday_set in HOLIDAY_FEATURE_SETS:
             kernel_func = self._get_holiday_kernel(holiday_kernel, holiday_kernel_alpha)
+            self.holiday_feature_set = SpecialDateFeatureSet(HOLIDAY_FEATURE_SETS[holiday_set], kernel_func)
     def _get_holiday_kernel(self, kernel_type: str, alpha: float):
         """Get holiday kernel function."""
         else:
             return "low_freq"
+    def _compute_enhanced_features(self, period_index: pd.PeriodIndex, freq_str: str) -> np.ndarray:
         """Compute enhanced time features based on frequency."""
         if not self.use_enhanced_features:
             return np.array([]).reshape(len(period_index), 0)
                 return []
             # Sort by magnitude and take top periods
+            sorted_indices = peak_indices[np.argsort(fft_magnitudes[peak_indices])[::-1]]
             top_indices = sorted_indices[: self.max_seasonal_periods]
             # Convert frequencies to periods
         try:
             standard_features = time_features_from_frequency_str(freq_str)
             if standard_features:
+                std_feat = np.stack([feat(period_index) for feat in standard_features], axis=-1)
                 all_features.append(std_feat)
         except Exception:
             pass
             all_features.append(holiday_feat)
         # Seasonality features (including auto-detected)
+        seasonality_feat = self._compute_seasonality_features(period_index, freq_str, time_series_values)
         if seasonality_feat.shape[1] > 0:
             all_features.append(seasonality_feat)
 def compute_batch_time_features(
+    start: list[np.datetime64],
     history_length: int,
     future_length: int,
     batch_size: int,
+    frequency: list[Frequency],
     K_max: int = 6,
+    time_feature_config: dict[str, Any] | None = None,
 ):
     """
     Compute time features from start timestamps and frequency.
             start_ts = BASE_START_DATE
         # Create history range with bounds checking
+        history_range = pd.date_range(start=start_ts, periods=history_length, freq=freq_str)
         # Check if history range goes beyond safe bounds
         if history_range[-1] > BASE_END_DATE:
+            safe_start = BASE_END_DATE - pd.tseries.frequencies.to_offset(freq_str) * (history_length + future_length)
             if safe_start < BASE_START_DATE:
                 safe_start = BASE_START_DATE
+            history_range = pd.date_range(start=safe_start, periods=history_length, freq=freq_str)
         future_start = history_range[-1] + pd.tseries.frequencies.to_offset(freq_str)
+        future_range = pd.date_range(start=future_start, periods=future_length, freq=freq_str)
         # Convert to period indices
         history_period_idx = history_range.to_period(period_freq_str)
         future_period_idx = future_range.to_period(period_freq_str)
         # Compute enhanced features
+        history_features = feature_generator.compute_features(history_period_idx, history_range, freq_str)
+        future_features = feature_generator.compute_features(future_period_idx, future_range, freq_str)
         # Pad or truncate to K_max
         history_features = _pad_or_truncate_features(history_features, K_max)

src/data/utils.py CHANGED Viewed

@@ -1,10 +1,9 @@
 import random
-from typing import Optional, Tuple, Union
 def sample_future_length(
-    range: Union[Tuple[int, int], str] = "gift_eval",
-    total_length: Optional[int] = None,
 ) -> int:
     """
     Sample a forecast length.
@@ -16,7 +15,7 @@ def sample_future_length(
       floor(0.45 * total_length) before sampling.
     """
     # Compute the cap when total_length is provided
-    cap: Optional[int] = None
     if total_length is not None:
         cap = max(1, int(0.45 * int(total_length)))
@@ -62,11 +61,11 @@ def sample_future_length(
         if cap is not None:
             filtered = [
                 (length_candidate, weight)
-                for length_candidate, weight in zip(lengths, weights)
                 if length_candidate <= cap
             ]
             if filtered:
-                lengths, weights = zip(*filtered)
                 lengths = list(lengths)
                 weights = list(weights)

 import random
 def sample_future_length(
+    range: tuple[int, int] | str = "gift_eval",
+    total_length: int | None = None,
 ) -> int:
     """
     Sample a forecast length.
       floor(0.45 * total_length) before sampling.
     """
     # Compute the cap when total_length is provided
+    cap: int | None = None
     if total_length is not None:
         cap = max(1, int(0.45 * int(total_length)))
         if cap is not None:
             filtered = [
                 (length_candidate, weight)
+                for length_candidate, weight in zip(lengths, weights, strict=True)
                 if length_candidate <= cap
             ]
             if filtered:
+                lengths, weights = zip(*filtered, strict=True)
                 lengths = list(lengths)
                 weights = list(weights)

src/gift_eval/__init__.py CHANGED Viewed

@@ -2,7 +2,11 @@
 from .core import DatasetMetadata, EvaluationItem, expand_datasets_arg
 from .predictor import TimeSeriesPredictor
-from .results import aggregate_results, get_all_datasets_full_name, write_results_to_disk
 __all__ = [
     "DatasetMetadata",

 from .core import DatasetMetadata, EvaluationItem, expand_datasets_arg
 from .predictor import TimeSeriesPredictor
+from .results import (
+    aggregate_results,
+    get_all_datasets_full_name,
+    write_results_to_disk,
+)
 __all__ = [
     "DatasetMetadata",

src/gift_eval/constants.py CHANGED Viewed

@@ -16,7 +16,6 @@ from gluonts.ev.metrics import (
     MeanWeightedSumQuantileLoss,
 )
 logger = logging.getLogger(__name__)
@@ -30,7 +29,7 @@ DATASET_PROPERTIES_PATH = _MODULE_DIR / "data" / "dataset_properties.json"
 try:
-    with open(DATASET_PROPERTIES_PATH, "r") as f:
         DATASET_PROPERTIES = json.load(f)
 except Exception as exc:  # pragma: no cover - logging path
     DATASET_PROPERTIES = {}
@@ -152,9 +151,7 @@ METRICS = (
     RMSE(),
     NRMSE(),
     ND(),
-    MeanWeightedSumQuantileLoss(
-        quantile_levels=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
-    ),
 )

     MeanWeightedSumQuantileLoss,
 )
 logger = logging.getLogger(__name__)
 try:
+    with open(DATASET_PROPERTIES_PATH) as f:
         DATASET_PROPERTIES = json.load(f)
 except Exception as exc:  # pragma: no cover - logging path
     DATASET_PROPERTIES = {}
     RMSE(),
     NRMSE(),
     ND(),
+    MeanWeightedSumQuantileLoss(quantile_levels=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
 )

src/gift_eval/core.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """Core data structures and helpers shared across GIFT-Eval modules."""
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, Union
 from src.gift_eval.constants import ALL_DATASETS
@@ -26,14 +25,14 @@ class EvaluationItem:
     """Container for evaluation results and optional figures."""
     dataset_metadata: DatasetMetadata
-    metrics: Dict
-    figures: List[Tuple[object, str]]
-DatasetSelection = Union[List[str], Tuple[str, ...], str]
-def expand_datasets_arg(datasets: DatasetSelection) -> List[str]:
     """Normalize dataset selection strings to explicit lists."""
     if isinstance(datasets, str):
@@ -60,5 +59,3 @@ __all__ = [
     "DatasetSelection",
     "expand_datasets_arg",
 ]

 """Core data structures and helpers shared across GIFT-Eval modules."""
 from dataclasses import dataclass
 from src.gift_eval.constants import ALL_DATASETS
     """Container for evaluation results and optional figures."""
     dataset_metadata: DatasetMetadata
+    metrics: dict
+    figures: list[tuple[object, str]]
+DatasetSelection = list[str] | tuple[str, ...] | str
+def expand_datasets_arg(datasets: DatasetSelection) -> list[str]:
     """Normalize dataset selection strings to explicit lists."""
     if isinstance(datasets, str):
     "DatasetSelection",
     "expand_datasets_arg",
 ]

src/gift_eval/data.py CHANGED Viewed

@@ -18,7 +18,6 @@ from collections.abc import Iterable, Iterator
 from enum import Enum
 from functools import cached_property
 from pathlib import Path
-from typing import Optional
 import datasets
 import pyarrow.compute as pc
@@ -97,9 +96,7 @@ class MultivariateToUnivariate(Transformation):
     def __init__(self, field):
         self.field = field
-    def __call__(
-        self, data_it: Iterable[DataEntry], is_train: bool = False
-    ) -> Iterator:
         for data_entry in data_it:
             item_id = data_entry["item_id"]
             val_ls = list(data_entry[self.field])
@@ -117,12 +114,10 @@ class Dataset:
         term: Term | str = Term.SHORT,
         to_univariate: bool = False,
         storage_path: str = None,
-        max_windows: Optional[int] = None,
     ):
         storage_path = Path(storage_path)
-        self.hf_dataset = datasets.load_from_disk(str(storage_path / name)).with_format(
-            "numpy"
-        )
         process = ProcessDataEntry(
             self.freq,
             one_dim_target=self.target_dim == 1,
@@ -130,9 +125,7 @@ class Dataset:
         self.gluonts_dataset = Map(compose(process, itemize_start), self.hf_dataset)
         if to_univariate:
-            self.gluonts_dataset = MultivariateToUnivariate("target").apply(
-                self.gluonts_dataset
-            )
         self.term = Term(term)
         self.name = name
@@ -143,9 +136,7 @@ class Dataset:
         freq = norm_freq_str(to_offset(self.freq).name)
         if freq.endswith("E"):
             freq = freq[:-1]
-        pred_len = (
-            M4_PRED_LENGTH_MAP[freq] if "m4" in self.name else PRED_LENGTH_MAP[freq]
-        )
         return self.term.multiplier * pred_len
     @cached_property
@@ -154,26 +145,13 @@ class Dataset:
     @cached_property
     def target_dim(self) -> int:
-        return (
-            target.shape[0]
-            if len((target := self.hf_dataset[0]["target"]).shape) > 1
-            else 1
-        )
     @cached_property
     def past_feat_dynamic_real_dim(self) -> int:
         if "past_feat_dynamic_real" not in self.hf_dataset[0]:
             return 0
-        elif (
-            len(
-                (
-                    past_feat_dynamic_real := self.hf_dataset[0][
-                        "past_feat_dynamic_real"
-                    ]
-                ).shape
-            )
-            > 1
-        ):
             return past_feat_dynamic_real.shape[0]
         else:
             return 1
@@ -188,11 +166,7 @@ class Dataset:
     @cached_property
     def _min_series_length(self) -> int:
         if self.hf_dataset[0]["target"].ndim > 1:
-            lengths = pc.list_value_length(
-                pc.list_flatten(
-                    pc.list_slice(self.hf_dataset.data.column("target"), 0, 1)
-                )
-            )
         else:
             lengths = pc.list_value_length(self.hf_dataset.data.column("target"))
         return min(lengths.to_numpy())
@@ -200,32 +174,24 @@ class Dataset:
     @cached_property
     def sum_series_length(self) -> int:
         if self.hf_dataset[0]["target"].ndim > 1:
-            lengths = pc.list_value_length(
-                pc.list_flatten(self.hf_dataset.data.column("target"))
-            )
         else:
             lengths = pc.list_value_length(self.hf_dataset.data.column("target"))
         return sum(lengths.to_numpy())
     @property
     def training_dataset(self) -> TrainingDataset:
-        training_dataset, _ = split(
-            self.gluonts_dataset, offset=-self.prediction_length * (self.windows + 1)
-        )
         return training_dataset
     @property
     def validation_dataset(self) -> TrainingDataset:
-        validation_dataset, _ = split(
-            self.gluonts_dataset, offset=-self.prediction_length * self.windows
-        )
         return validation_dataset
     @property
     def test_data(self) -> TestData:
-        _, test_template = split(
-            self.gluonts_dataset, offset=-self.prediction_length * self.windows
-        )
         test_data = test_template.generate_instances(
             prediction_length=self.prediction_length,
             windows=self.windows,

 from enum import Enum
 from functools import cached_property
 from pathlib import Path
 import datasets
 import pyarrow.compute as pc
     def __init__(self, field):
         self.field = field
+    def __call__(self, data_it: Iterable[DataEntry], is_train: bool = False) -> Iterator:
         for data_entry in data_it:
             item_id = data_entry["item_id"]
             val_ls = list(data_entry[self.field])
         term: Term | str = Term.SHORT,
         to_univariate: bool = False,
         storage_path: str = None,
+        max_windows: int | None = None,
     ):
         storage_path = Path(storage_path)
+        self.hf_dataset = datasets.load_from_disk(str(storage_path / name)).with_format("numpy")
         process = ProcessDataEntry(
             self.freq,
             one_dim_target=self.target_dim == 1,
         self.gluonts_dataset = Map(compose(process, itemize_start), self.hf_dataset)
         if to_univariate:
+            self.gluonts_dataset = MultivariateToUnivariate("target").apply(self.gluonts_dataset)
         self.term = Term(term)
         self.name = name
         freq = norm_freq_str(to_offset(self.freq).name)
         if freq.endswith("E"):
             freq = freq[:-1]
+        pred_len = M4_PRED_LENGTH_MAP[freq] if "m4" in self.name else PRED_LENGTH_MAP[freq]
         return self.term.multiplier * pred_len
     @cached_property
     @cached_property
     def target_dim(self) -> int:
+        return target.shape[0] if len((target := self.hf_dataset[0]["target"]).shape) > 1 else 1
     @cached_property
     def past_feat_dynamic_real_dim(self) -> int:
         if "past_feat_dynamic_real" not in self.hf_dataset[0]:
             return 0
+        elif len((past_feat_dynamic_real := self.hf_dataset[0]["past_feat_dynamic_real"]).shape) > 1:
             return past_feat_dynamic_real.shape[0]
         else:
             return 1
     @cached_property
     def _min_series_length(self) -> int:
         if self.hf_dataset[0]["target"].ndim > 1:
+            lengths = pc.list_value_length(pc.list_flatten(pc.list_slice(self.hf_dataset.data.column("target"), 0, 1)))
         else:
             lengths = pc.list_value_length(self.hf_dataset.data.column("target"))
         return min(lengths.to_numpy())
     @cached_property
     def sum_series_length(self) -> int:
         if self.hf_dataset[0]["target"].ndim > 1:
+            lengths = pc.list_value_length(pc.list_flatten(self.hf_dataset.data.column("target")))
         else:
             lengths = pc.list_value_length(self.hf_dataset.data.column("target"))
         return sum(lengths.to_numpy())
     @property
     def training_dataset(self) -> TrainingDataset:
+        training_dataset, _ = split(self.gluonts_dataset, offset=-self.prediction_length * (self.windows + 1))
         return training_dataset
     @property
     def validation_dataset(self) -> TrainingDataset:
+        validation_dataset, _ = split(self.gluonts_dataset, offset=-self.prediction_length * self.windows)
         return validation_dataset
     @property
     def test_data(self) -> TestData:
+        _, test_template = split(self.gluonts_dataset, offset=-self.prediction_length * self.windows)
         test_data = test_template.generate_instances(
             prediction_length=self.prediction_length,
             windows=self.windows,

src/gift_eval/evaluate.py CHANGED Viewed

@@ -2,7 +2,6 @@ import argparse
 import logging
 import warnings
 from pathlib import Path
-from typing import List, Optional, Tuple
 import matplotlib
 from gluonts.model.evaluation import evaluate_model
@@ -44,19 +43,20 @@ class WarningFilter(logging.Filter):
 # Filter out gluonts warnings about mean predictions
 gts_logger = logging.getLogger("gluonts.model.forecast")
-gts_logger.addFilter(
-    WarningFilter("The mean prediction is not stored in the forecast data")
-)
 def construct_evaluation_data(
     dataset_name: str,
     dataset_storage_path: str,
-    terms: List[str] = ["short", "medium", "long"],
-    max_windows: Optional[int] = None,
-) -> List[Tuple[Dataset, DatasetMetadata]]:
     """Build datasets and rich metadata per term for a dataset name."""
-    sub_datasets: List[Tuple[Dataset, DatasetMetadata]] = []
     if "/" in dataset_name:
         ds_key, ds_freq = dataset_name.split("/")
@@ -69,9 +69,7 @@ def construct_evaluation_data(
     for term in terms:
         # Skip medium/long terms for datasets that don't support them
-        if (
-            term == "medium" or term == "long"
-        ) and dataset_name not in MED_LONG_DATASETS:
             continue
         # Probe once to determine dimensionality
@@ -96,7 +94,7 @@ def construct_evaluation_data(
         # Compute metadata
         season_length = get_seasonality(dataset.freq)
         actual_freq = ds_freq if ds_freq else dataset.freq
         metadata = DatasetMetadata(
             full_name=f"{ds_key}/{actual_freq}/{term}",
             key=ds_key,
@@ -118,14 +116,17 @@ def evaluate_datasets(
     predictor: TimeSeriesPredictor,
     dataset: str,
     dataset_storage_path: str,
-    terms: List[str] = ["short", "medium", "long"],
-    max_windows: Optional[int] = None,
     batch_size: int = 48,
-    max_context_length: Optional[int] = 1024,
     create_plots: bool = False,
     max_plots_per_dataset: int = 10,
-) -> List[EvaluationItem]:
     """Evaluate predictor on one dataset across the requested terms."""
     sub_datasets = construct_evaluation_data(
         dataset_name=dataset,
         dataset_storage_path=dataset_storage_path,
@@ -133,7 +134,7 @@ def evaluate_datasets(
         max_windows=max_windows,
     )
-    results: List[EvaluationItem] = []
     for i, (sub_dataset, metadata) in enumerate(sub_datasets):
         logger.info(f"Evaluating {i + 1}/{len(sub_datasets)}: {metadata.full_name}")
         logger.info(f"  Dataset size: {len(sub_dataset.test_data)}")
@@ -161,7 +162,7 @@ def evaluate_datasets(
             seasonality=metadata.season_length,
         )
-        figs: List[Tuple[object, str]] = []
         if create_plots:
             forecasts = predictor.predict(sub_dataset.test_data.input)
             figs = create_plots_for_dataset(
@@ -172,21 +173,19 @@ def evaluate_datasets(
                 max_context_length=max_context_length,
             )
-        results.append(
-            EvaluationItem(dataset_metadata=metadata, metrics=res, figures=figs)
-        )
     return results
 def _run_evaluation(
     predictor: TimeSeriesPredictor,
-    datasets: List[str] | str,
-    terms: List[str],
     dataset_storage_path: str,
-    max_windows: Optional[int] = None,
     batch_size: int = 48,
-    max_context_length: Optional[int] = 1024,
     output_dir: str = "gift_eval_results",
     model_name: str = "TimeSeriesModel",
     create_plots: bool = False,
@@ -220,12 +219,12 @@ def _run_evaluation(
 def evaluate_from_paths(
     model_path: str,
     config_path: str,
-    datasets: List[str] | str,
-    terms: List[str],
     dataset_storage_path: str,
-    max_windows: Optional[int] = None,
     batch_size: int = 48,
-    max_context_length: Optional[int] = 1024,
     output_dir: str = "gift_eval_results",
     model_name: str = "TimeSeriesModel",
     create_plots: bool = False,
@@ -265,12 +264,12 @@ def evaluate_from_paths(
 def evaluate_in_memory(
     model,
     config: dict,
-    datasets: List[str] | str,
-    terms: List[str],
     dataset_storage_path: str,
-    max_windows: Optional[int] = None,
     batch_size: int = 48,
-    max_context_length: Optional[int] = 1024,
     output_dir: str = "gift_eval_results",
     model_name: str = "TimeSeriesModel",
     create_plots: bool = False,
@@ -302,9 +301,7 @@ def evaluate_in_memory(
 def _parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Evaluate TimeSeriesModel on GIFT-Eval datasets"
-    )
     # Model configuration
     parser.add_argument(
@@ -353,9 +350,7 @@ def _parse_args() -> argparse.Namespace:
     )
     # Inference configuration
-    parser.add_argument(
-        "--batch_size", type=int, default=48, help="Batch size for model inference"
-    )
     parser.add_argument(
         "--max_context_length",
         type=int,

 import logging
 import warnings
 from pathlib import Path
 import matplotlib
 from gluonts.model.evaluation import evaluate_model
 # Filter out gluonts warnings about mean predictions
 gts_logger = logging.getLogger("gluonts.model.forecast")
+gts_logger.addFilter(WarningFilter("The mean prediction is not stored in the forecast data"))
 def construct_evaluation_data(
     dataset_name: str,
     dataset_storage_path: str,
+    terms: list[str] | None = None,
+    max_windows: int | None = None,
+) -> list[tuple[Dataset, DatasetMetadata]]:
     """Build datasets and rich metadata per term for a dataset name."""
+    if terms is None:
+        terms = ["short", "medium", "long"]
+    sub_datasets: list[tuple[Dataset, DatasetMetadata]] = []
     if "/" in dataset_name:
         ds_key, ds_freq = dataset_name.split("/")
     for term in terms:
         # Skip medium/long terms for datasets that don't support them
+        if (term == "medium" or term == "long") and dataset_name not in MED_LONG_DATASETS:
             continue
         # Probe once to determine dimensionality
         # Compute metadata
         season_length = get_seasonality(dataset.freq)
         actual_freq = ds_freq if ds_freq else dataset.freq
         metadata = DatasetMetadata(
             full_name=f"{ds_key}/{actual_freq}/{term}",
             key=ds_key,
     predictor: TimeSeriesPredictor,
     dataset: str,
     dataset_storage_path: str,
+    terms: list[str] | None = None,
+    max_windows: int | None = None,
     batch_size: int = 48,
+    max_context_length: int | None = 1024,
     create_plots: bool = False,
     max_plots_per_dataset: int = 10,
+) -> list[EvaluationItem]:
     """Evaluate predictor on one dataset across the requested terms."""
+    if terms is None:
+        terms = ["short", "medium", "long"]
     sub_datasets = construct_evaluation_data(
         dataset_name=dataset,
         dataset_storage_path=dataset_storage_path,
         max_windows=max_windows,
     )
+    results: list[EvaluationItem] = []
     for i, (sub_dataset, metadata) in enumerate(sub_datasets):
         logger.info(f"Evaluating {i + 1}/{len(sub_datasets)}: {metadata.full_name}")
         logger.info(f"  Dataset size: {len(sub_dataset.test_data)}")
             seasonality=metadata.season_length,
         )
+        figs: list[tuple[object, str]] = []
         if create_plots:
             forecasts = predictor.predict(sub_dataset.test_data.input)
             figs = create_plots_for_dataset(
                 max_context_length=max_context_length,
             )
+        results.append(EvaluationItem(dataset_metadata=metadata, metrics=res, figures=figs))
     return results
 def _run_evaluation(
     predictor: TimeSeriesPredictor,
+    datasets: list[str] | str,
+    terms: list[str],
     dataset_storage_path: str,
+    max_windows: int | None = None,
     batch_size: int = 48,
+    max_context_length: int | None = 1024,
     output_dir: str = "gift_eval_results",
     model_name: str = "TimeSeriesModel",
     create_plots: bool = False,
 def evaluate_from_paths(
     model_path: str,
     config_path: str,
+    datasets: list[str] | str,
+    terms: list[str],
     dataset_storage_path: str,
+    max_windows: int | None = None,
     batch_size: int = 48,
+    max_context_length: int | None = 1024,
     output_dir: str = "gift_eval_results",
     model_name: str = "TimeSeriesModel",
     create_plots: bool = False,
 def evaluate_in_memory(
     model,
     config: dict,
+    datasets: list[str] | str,
+    terms: list[str],
     dataset_storage_path: str,
+    max_windows: int | None = None,
     batch_size: int = 48,
+    max_context_length: int | None = 1024,
     output_dir: str = "gift_eval_results",
     model_name: str = "TimeSeriesModel",
     create_plots: bool = False,
 def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Evaluate TimeSeriesModel on GIFT-Eval datasets")
     # Model configuration
     parser.add_argument(
     )
     # Inference configuration
+    parser.add_argument("--batch_size", type=int, default=48, help="Batch size for model inference")
     parser.add_argument(
         "--max_context_length",
         type=int,

src/gift_eval/predictor.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """Predictor implementation wrapping the TimeSeriesModel for GIFT-Eval."""
 import logging
-from typing import Iterator, List, Optional
 import numpy as np
 import torch
@@ -16,7 +16,6 @@ from src.data.scalers import RobustScaler
 from src.models.model import TimeSeriesModel
 from src.utils.utils import device
 logger = logging.getLogger(__name__)
@@ -30,7 +29,7 @@ class TimeSeriesPredictor(Predictor):
         ds_prediction_length: int,
         ds_freq: str,
         batch_size: int = 32,
-        max_context_length: Optional[int] = None,
         debug: bool = False,
     ) -> None:
         # Dataset-specific context (can be updated per dataset/term)
@@ -46,9 +45,7 @@ class TimeSeriesPredictor(Predictor):
         self.config = config
         # Initialize scaler (using same type as model)
-        scaler_type = self.config.get("TimeSeriesModel", {}).get(
-            "scaler", "custom_robust"
-        )
         epsilon = self.config.get("TimeSeriesModel", {}).get("epsilon", 1e-3)
         if scaler_type == "custom_robust":
             self.scaler = RobustScaler(epsilon=epsilon)
@@ -57,10 +54,10 @@ class TimeSeriesPredictor(Predictor):
     def set_dataset_context(
         self,
-        prediction_length: Optional[int] = None,
-        freq: Optional[str] = None,
-        batch_size: Optional[int] = None,
-        max_context_length: Optional[int] = None,
     ) -> None:
         """Update lightweight dataset-specific attributes without reloading the model."""
@@ -81,7 +78,7 @@ class TimeSeriesPredictor(Predictor):
         ds_prediction_length: int,
         ds_freq: str,
         batch_size: int = 32,
-        max_context_length: Optional[int] = None,
         debug: bool = False,
     ) -> "TimeSeriesPredictor":
         return cls(
@@ -102,10 +99,10 @@ class TimeSeriesPredictor(Predictor):
         ds_prediction_length: int,
         ds_freq: str,
         batch_size: int = 32,
-        max_context_length: Optional[int] = None,
         debug: bool = False,
     ) -> "TimeSeriesPredictor":
-        with open(config_path, "r") as f:
             config = yaml.safe_load(f)
         model = cls._load_model_from_path(config=config, model_path=model_path)
         return cls(
@@ -151,13 +148,13 @@ class TimeSeriesPredictor(Predictor):
                 seq_len = min(seq_len, self.max_context_length)
             return seq_len
-        length_to_items: dict[int, List[tuple[int, object]]] = {}
         for idx, entry in enumerate(test_data_input):
             seq_len = _effective_length(entry)
             length_to_items.setdefault(seq_len, []).append((idx, entry))
         total = len(test_data_input)
-        ordered_results: List[Optional[QuantileForecast]] = [None] * total
         for _, items in length_to_items.items():
             for i in range(0, len(items), self.batch_size):
@@ -169,7 +166,7 @@ class TimeSeriesPredictor(Predictor):
         return ordered_results  # type: ignore[return-value]
-    def _predict_batch(self, test_data_batch: List) -> List[QuantileForecast]:
         """Generate predictions for a batch of time series."""
         logger.debug(f"Processing batch of size: {len(test_data_batch)}")
@@ -191,9 +188,7 @@ class TimeSeriesPredictor(Predictor):
                 with torch.no_grad():
                     model_output = self.model(batch_container, drop_enc_allow=False)
-            forecasts = self._convert_to_forecasts(
-                model_output, test_data_batch, batch_container
-            )
             logger.debug(f"Generated {len(forecasts)} forecasts")
             return forecasts
@@ -201,9 +196,7 @@ class TimeSeriesPredictor(Predictor):
             logger.error(f"Error in batch prediction: {exc}")
             raise
-    def _convert_to_batch_container(
-        self, test_data_batch: List
-    ) -> BatchTimeSeriesContainer:
         """Convert gluonts test data to BatchTimeSeriesContainer."""
         batch_size = len(test_data_batch)
@@ -219,10 +212,7 @@ class TimeSeriesPredictor(Predictor):
             else:
                 target = target.T
-            if (
-                self.max_context_length is not None
-                and len(target) > self.max_context_length
-            ):
                 target = target[-self.max_context_length :]
             history_values_list.append(target)
@@ -232,9 +222,7 @@ class TimeSeriesPredictor(Predictor):
         history_values_np = np.stack(history_values_list, axis=0)
         num_channels = history_values_np.shape[2]
-        history_values = torch.tensor(
-            history_values_np, dtype=torch.float32, device=device
-        )
         future_values = torch.zeros(
             (batch_size, self.ds_prediction_length, num_channels),
@@ -252,28 +240,24 @@ class TimeSeriesPredictor(Predictor):
     def _convert_to_forecasts(
         self,
         model_output: dict,
-        test_data_batch: List,
         batch_container: BatchTimeSeriesContainer,
-    ) -> List[QuantileForecast]:
         """Convert model predictions to QuantileForecast objects."""
         predictions = model_output["result"]
         scale_statistics = model_output["scale_statistics"]
         if predictions.ndim == 4:
-            predictions_unscaled = self.scaler.inverse_scale(
-                predictions, scale_statistics
-            )
             is_quantile = True
             quantile_levels = self.model.quantiles
         else:
-            predictions_unscaled = self.scaler.inverse_scale(
-                predictions, scale_statistics
-            )
             is_quantile = False
             quantile_levels = [0.5]
-        forecasts: List[QuantileForecast] = []
         for idx, entry in enumerate(test_data_batch):
             history_length = int(batch_container.history_values.shape[1])
             start_date = entry["start"]
@@ -314,5 +298,3 @@ class TimeSeriesPredictor(Predictor):
 __all__ = ["TimeSeriesPredictor"]

 """Predictor implementation wrapping the TimeSeriesModel for GIFT-Eval."""
 import logging
+from collections.abc import Iterator
 import numpy as np
 import torch
 from src.models.model import TimeSeriesModel
 from src.utils.utils import device
 logger = logging.getLogger(__name__)
         ds_prediction_length: int,
         ds_freq: str,
         batch_size: int = 32,
+        max_context_length: int | None = None,
         debug: bool = False,
     ) -> None:
         # Dataset-specific context (can be updated per dataset/term)
         self.config = config
         # Initialize scaler (using same type as model)
+        scaler_type = self.config.get("TimeSeriesModel", {}).get("scaler", "custom_robust")
         epsilon = self.config.get("TimeSeriesModel", {}).get("epsilon", 1e-3)
         if scaler_type == "custom_robust":
             self.scaler = RobustScaler(epsilon=epsilon)
     def set_dataset_context(
         self,
+        prediction_length: int | None = None,
+        freq: str | None = None,
+        batch_size: int | None = None,
+        max_context_length: int | None = None,
     ) -> None:
         """Update lightweight dataset-specific attributes without reloading the model."""
         ds_prediction_length: int,
         ds_freq: str,
         batch_size: int = 32,
+        max_context_length: int | None = None,
         debug: bool = False,
     ) -> "TimeSeriesPredictor":
         return cls(
         ds_prediction_length: int,
         ds_freq: str,
         batch_size: int = 32,
+        max_context_length: int | None = None,
         debug: bool = False,
     ) -> "TimeSeriesPredictor":
+        with open(config_path) as f:
             config = yaml.safe_load(f)
         model = cls._load_model_from_path(config=config, model_path=model_path)
         return cls(
                 seq_len = min(seq_len, self.max_context_length)
             return seq_len
+        length_to_items: dict[int, list[tuple[int, object]]] = {}
         for idx, entry in enumerate(test_data_input):
             seq_len = _effective_length(entry)
             length_to_items.setdefault(seq_len, []).append((idx, entry))
         total = len(test_data_input)
+        ordered_results: list[QuantileForecast | None] = [None] * total
         for _, items in length_to_items.items():
             for i in range(0, len(items), self.batch_size):
         return ordered_results  # type: ignore[return-value]
+    def _predict_batch(self, test_data_batch: list) -> list[QuantileForecast]:
         """Generate predictions for a batch of time series."""
         logger.debug(f"Processing batch of size: {len(test_data_batch)}")
                 with torch.no_grad():
                     model_output = self.model(batch_container, drop_enc_allow=False)
+            forecasts = self._convert_to_forecasts(model_output, test_data_batch, batch_container)
             logger.debug(f"Generated {len(forecasts)} forecasts")
             return forecasts
             logger.error(f"Error in batch prediction: {exc}")
             raise
+    def _convert_to_batch_container(self, test_data_batch: list) -> BatchTimeSeriesContainer:
         """Convert gluonts test data to BatchTimeSeriesContainer."""
         batch_size = len(test_data_batch)
             else:
                 target = target.T
+            if self.max_context_length is not None and len(target) > self.max_context_length:
                 target = target[-self.max_context_length :]
             history_values_list.append(target)
         history_values_np = np.stack(history_values_list, axis=0)
         num_channels = history_values_np.shape[2]
+        history_values = torch.tensor(history_values_np, dtype=torch.float32, device=device)
         future_values = torch.zeros(
             (batch_size, self.ds_prediction_length, num_channels),
     def _convert_to_forecasts(
         self,
         model_output: dict,
+        test_data_batch: list,
         batch_container: BatchTimeSeriesContainer,
+    ) -> list[QuantileForecast]:
         """Convert model predictions to QuantileForecast objects."""
         predictions = model_output["result"]
         scale_statistics = model_output["scale_statistics"]
         if predictions.ndim == 4:
+            predictions_unscaled = self.scaler.inverse_scale(predictions, scale_statistics)
             is_quantile = True
             quantile_levels = self.model.quantiles
         else:
+            predictions_unscaled = self.scaler.inverse_scale(predictions, scale_statistics)
             is_quantile = False
             quantile_levels = [0.5]
+        forecasts: list[QuantileForecast] = []
         for idx, entry in enumerate(test_data_batch):
             history_length = int(batch_container.history_values.shape[1])
             start_date = entry["start"]
 __all__ = ["TimeSeriesPredictor"]

src/gift_eval/results.py CHANGED Viewed

@@ -5,7 +5,6 @@ import csv
 import glob
 import logging
 from pathlib import Path
-from typing import List, Optional
 import pandas as pd
@@ -18,7 +17,6 @@ from src.gift_eval.constants import (
 )
 from src.gift_eval.core import DatasetMetadata, EvaluationItem
 logger = logging.getLogger(__name__)
@@ -36,7 +34,7 @@ def _ensure_results_csv(csv_file_path: Path) -> None:
 def write_results_to_disk(
-    items: List[EvaluationItem],
     dataset_name: str,
     output_dir: Path,
     model_name: str,
@@ -56,17 +54,13 @@ def write_results_to_disk(
         writer = csv.writer(csvfile)
         for item in items:
             md: DatasetMetadata = item.dataset_metadata
-            metric_values: List[Optional[float]] = []
             for metric_name in STANDARD_METRIC_NAMES:
                 value = item.metrics.get(metric_name, None)
                 if value is None:
                     metric_values.append(None)
                 else:
-                    if (
-                        hasattr(value, "__len__")
-                        and not isinstance(value, (str, bytes))
-                        and len(value) == 1
-                    ):
                         value = value[0]
                     elif hasattr(value, "item"):
                         value = value.item()
@@ -75,9 +69,7 @@ def write_results_to_disk(
             ds_key = md.key.lower()
             props = DATASET_PROPERTIES.get(ds_key, {})
             domain = props.get("domain", "unknown")
-            num_variates = props.get(
-                "num_variates", 1 if md.to_univariate else md.target_dim
-            )
             row = [md.full_name, model_name] + metric_values + [domain, num_variates]
             writer.writerow(row)
@@ -99,11 +91,11 @@ def write_results_to_disk(
         logger.info("Plots saved under %s", output_dir / "plots")
-def get_all_datasets_full_name() -> List[str]:
     """Get all possible dataset full names for validation."""
     terms = ["short", "medium", "long"]
-    datasets_full_names: List[str] = []
     for name in ALL_DATASETS:
         for term in terms:
@@ -119,9 +111,7 @@ def get_all_datasets_full_name() -> List[str]:
                 ds_key = PRETTY_NAMES.get(ds_key, ds_key)
                 ds_freq = DATASET_PROPERTIES.get(ds_key, {}).get("frequency")
-            datasets_full_names.append(
-                f"{ds_key}/{ds_freq if ds_freq else 'unknown'}/{term}"
-            )
     return datasets_full_names
@@ -139,7 +129,7 @@ def aggregate_results(result_root_dir: str | Path) -> pd.DataFrame | None:
         logger.error("No result files found!")
         return None
-    dataframes: List[pd.DataFrame] = []
     for file in result_files:
         try:
             df = pd.read_csv(file)
@@ -159,26 +149,18 @@ def aggregate_results(result_root_dir: str | Path) -> pd.DataFrame | None:
     combined_df = pd.concat(dataframes, ignore_index=True).sort_values("dataset")
     if len(combined_df) != len(set(combined_df.dataset)):
-        duplicate_datasets = combined_df.dataset[
-            combined_df.dataset.duplicated()
-        ].tolist()
         logger.warning("Warning: Duplicate datasets found: %s", duplicate_datasets)
         combined_df = combined_df.drop_duplicates(subset=["dataset"], keep="first")
-        logger.info(
-            "Removed duplicates, %s unique datasets remaining", len(combined_df)
-        )
     logger.info("Combined results: %s datasets", len(combined_df))
     all_datasets_full_name = get_all_datasets_full_name()
     completed_experiments = combined_df.dataset.tolist()
-    completed_experiments_clean = [
-        exp for exp in completed_experiments if exp in all_datasets_full_name
-    ]
-    missing_or_failed_experiments = [
-        exp for exp in all_datasets_full_name if exp not in completed_experiments_clean
-    ]
     logger.info("=== EXPERIMENT SUMMARY ===")
     logger.info("Total expected datasets: %s", len(all_datasets_full_name))
@@ -195,9 +177,7 @@ def aggregate_results(result_root_dir: str | Path) -> pd.DataFrame | None:
             logger.info("  %3d: %s", idx, exp)
     completion_rate = (
-        len(completed_experiments_clean) / len(all_datasets_full_name) * 100
-        if all_datasets_full_name
-        else 0.0
     )
     logger.info("Completion rate: %.1f%%", completion_rate)
@@ -218,9 +198,7 @@ __all__ = [
 def main() -> None:
     """CLI entry point for aggregating results from disk."""
-    parser = argparse.ArgumentParser(
-        description="Aggregate GIFT-Eval results from multiple CSV files"
-    )
     parser.add_argument(
         "--result_root_dir",
         type=str,
@@ -231,13 +209,11 @@ def main() -> None:
     args = parser.parse_args()
     result_root_dir = Path(args.result_root_dir)
-    logging.basicConfig(
-        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-    )
     logger.info("Searching in directory: %s", result_root_dir)
     aggregate_results(result_root_dir=result_root_dir)
-if __name__ == "__main__":
-    main()

 import glob
 import logging
 from pathlib import Path
 import pandas as pd
 )
 from src.gift_eval.core import DatasetMetadata, EvaluationItem
 logger = logging.getLogger(__name__)
 def write_results_to_disk(
+    items: list[EvaluationItem],
     dataset_name: str,
     output_dir: Path,
     model_name: str,
         writer = csv.writer(csvfile)
         for item in items:
             md: DatasetMetadata = item.dataset_metadata
+            metric_values: list[float | None] = []
             for metric_name in STANDARD_METRIC_NAMES:
                 value = item.metrics.get(metric_name, None)
                 if value is None:
                     metric_values.append(None)
                 else:
+                    if hasattr(value, "__len__") and not isinstance(value, (str, bytes)) and len(value) == 1:
                         value = value[0]
                     elif hasattr(value, "item"):
                         value = value.item()
             ds_key = md.key.lower()
             props = DATASET_PROPERTIES.get(ds_key, {})
             domain = props.get("domain", "unknown")
+            num_variates = props.get("num_variates", 1 if md.to_univariate else md.target_dim)
             row = [md.full_name, model_name] + metric_values + [domain, num_variates]
             writer.writerow(row)
         logger.info("Plots saved under %s", output_dir / "plots")
+def get_all_datasets_full_name() -> list[str]:
     """Get all possible dataset full names for validation."""
     terms = ["short", "medium", "long"]
+    datasets_full_names: list[str] = []
     for name in ALL_DATASETS:
         for term in terms:
                 ds_key = PRETTY_NAMES.get(ds_key, ds_key)
                 ds_freq = DATASET_PROPERTIES.get(ds_key, {}).get("frequency")
+            datasets_full_names.append(f"{ds_key}/{ds_freq if ds_freq else 'unknown'}/{term}")
     return datasets_full_names
         logger.error("No result files found!")
         return None
+    dataframes: list[pd.DataFrame] = []
     for file in result_files:
         try:
             df = pd.read_csv(file)
     combined_df = pd.concat(dataframes, ignore_index=True).sort_values("dataset")
     if len(combined_df) != len(set(combined_df.dataset)):
+        duplicate_datasets = combined_df.dataset[combined_df.dataset.duplicated()].tolist()
         logger.warning("Warning: Duplicate datasets found: %s", duplicate_datasets)
         combined_df = combined_df.drop_duplicates(subset=["dataset"], keep="first")
+        logger.info("Removed duplicates, %s unique datasets remaining", len(combined_df))
     logger.info("Combined results: %s datasets", len(combined_df))
     all_datasets_full_name = get_all_datasets_full_name()
     completed_experiments = combined_df.dataset.tolist()
+    completed_experiments_clean = [exp for exp in completed_experiments if exp in all_datasets_full_name]
+    missing_or_failed_experiments = [exp for exp in all_datasets_full_name if exp not in completed_experiments_clean]
     logger.info("=== EXPERIMENT SUMMARY ===")
     logger.info("Total expected datasets: %s", len(all_datasets_full_name))
             logger.info("  %3d: %s", idx, exp)
     completion_rate = (
+        len(completed_experiments_clean) / len(all_datasets_full_name) * 100 if all_datasets_full_name else 0.0
     )
     logger.info("Completion rate: %.1f%%", completion_rate)
 def main() -> None:
     """CLI entry point for aggregating results from disk."""
+    parser = argparse.ArgumentParser(description="Aggregate GIFT-Eval results from multiple CSV files")
     parser.add_argument(
         "--result_root_dir",
         type=str,
     args = parser.parse_args()
     result_root_dir = Path(args.result_root_dir)
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
     logger.info("Searching in directory: %s", result_root_dir)
     aggregate_results(result_root_dir=result_root_dir)
+if __name__ == "__main__":
+    main()

src/models/blocks.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import torch
 import torch.nn as nn
 from src.models.gated_deltaproduct import GatedDeltaProductConfig
@@ -56,7 +55,5 @@ class GatedDeltaProductEncoder(nn.Module):
         Returns:
             Output tensor of same shape as input
         """
-        x, last_hidden_state, _ = self.encoder_layer(
-            x, output_attentions=True, initial_state=initial_state
-        )
         return x, last_hidden_state

 import torch.nn as nn
 from src.models.gated_deltaproduct import GatedDeltaProductConfig
         Returns:
             Output tensor of same shape as input
         """
+        x, last_hidden_state, _ = self.encoder_layer(x, output_attentions=True, initial_state=initial_state)
         return x, last_hidden_state

src/models/gated_deltaproduct/configuration_gated_deltaproduct.py CHANGED Viewed

@@ -76,6 +76,7 @@ class GatedDeltaProductConfig(PretrainedConfig):
                 "`fuse_linear_cross_entropy` is enabled, which can improves memory efficiency "
                 "at the potential cost of reduced precision. "
                 "If you observe issues like loss divergence, consider disabling this setting.",
             )
         # DeltaProduct specific
@@ -87,13 +88,9 @@ class GatedDeltaProductConfig(PretrainedConfig):
             if not isinstance(attn, dict):
                 raise ValueError("attn must be a dictionary")
             if "layers" not in attn:
-                raise ValueError(
-                    "Layer indices must be provided to initialize hybrid attention layers"
-                )
             if "num_heads" not in attn:
-                raise ValueError(
-                    "Number of heads must be provided to initialize hybrid attention layers"
-                )
             attn["num_kv_heads"] = attn.get("num_kv_heads", attn["num_heads"])
             attn["qkv_bias"] = attn.get("qkv_bias", False)
             attn["window_size"] = attn.get("window_size", None)

                 "`fuse_linear_cross_entropy` is enabled, which can improves memory efficiency "
                 "at the potential cost of reduced precision. "
                 "If you observe issues like loss divergence, consider disabling this setting.",
+                stacklevel=2,
             )
         # DeltaProduct specific
             if not isinstance(attn, dict):
                 raise ValueError("attn must be a dictionary")
             if "layers" not in attn:
+                raise ValueError("Layer indices must be provided to initialize hybrid attention layers")
             if "num_heads" not in attn:
+                raise ValueError("Number of heads must be provided to initialize hybrid attention layers")
             attn["num_kv_heads"] = attn.get("num_kv_heads", attn["num_heads"])
             attn["qkv_bias"] = attn.get("qkv_bias", False)
             attn["window_size"] = attn.get("window_size", None)

src/models/gated_deltaproduct/gated_deltaproduct.py CHANGED Viewed

@@ -1,11 +1,10 @@
-# -*- coding: utf-8 -*-
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 from __future__ import annotations
 import math
 import warnings
-from typing import TYPE_CHECKING, Dict, Optional, Tuple
 import torch
 import torch.nn as nn
@@ -70,22 +69,19 @@ class GatedDeltaProduct(nn.Module):
         self.key_dim = int(self.num_heads * self.head_k_dim)
         self.value_dim = int(self.num_v_heads * self.head_v_dim)
         self.layer_idx = layer_idx
-        self.init_hidden_state = nn.Parameter(
-            torch.randn(self.num_heads, self.head_dim, self.head_dim)
-        )
         # Consistency check: Ensure expand_v produces integer values
-        if not math.isclose(
-            self.num_v_heads * self.head_dim * expand_v, self.value_dim, rel_tol=1e-5
-        ):
             raise ValueError(
-                f"expand_v={expand_v} does not produce an integer value when multiplied by key_dim={self.key_dim}. "
-                f"Resulting value_dim would be {self.num_v_heads * self.head_dim * expand_v}, which is invalid for nn.Linear."
             )
         if self.num_v_heads > self.num_heads and self.num_v_heads % self.num_heads != 0:
-            raise ValueError(
-                f"num_v_heads={self.num_v_heads} must be divisible by num_heads={self.num_heads}."
-            )
         if not math.isclose(head_dim * expand_v, self.head_v_dim, rel_tol=1e-5):
             raise ValueError(
@@ -96,12 +92,8 @@ class GatedDeltaProduct(nn.Module):
         self.q_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
         self.k_proj = nn.Linear(hidden_size, self.key_dim * num_householder, bias=False)
-        self.v_proj = nn.Linear(
-            hidden_size, self.value_dim * num_householder, bias=False
-        )
-        self.b_proj = nn.Linear(
-            hidden_size, self.num_v_heads * num_householder, bias=False
-        )
         if self.use_forget_gate:
             self.a_proj = nn.Linear(hidden_size, self.num_v_heads, bias=False)
@@ -112,10 +104,7 @@ class GatedDeltaProduct(nn.Module):
             dt_min = 0.001
             dt_max = 0.1
             dt_init_floor = 1e-4
-            dt = torch.exp(
-                torch.rand(self.num_v_heads) * (math.log(dt_max) - math.log(dt_min))
-                + math.log(dt_min)
-            )
             dt = torch.clamp(dt, min=dt_init_floor)
             # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
             inv_dt = dt + torch.log(-torch.expm1(-dt))
@@ -168,13 +157,13 @@ class GatedDeltaProduct(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        past_key_values: Optional[Cache] = None,
-        initial_state: Optional[torch.Tensor] = None,
-        use_cache: Optional[bool] = False,
-        output_attentions: Optional[bool] = False,
-        **kwargs: Unpack[Dict],
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
         if attention_mask is not None:
             assert len(attention_mask.shape) == 2, (
                 "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
@@ -196,9 +185,7 @@ class GatedDeltaProduct(nn.Module):
         cu_seqlens = kwargs.get("cu_seqlens", None)
         if attention_mask is not None:
             indices, cu_seqlens, _ = get_unpad_data(attention_mask[:, -q_len:])
-            hidden_states = index_first_axis(
-                rearrange(hidden_states, "b s ... -> (b s) ..."), indices
-            ).unsqueeze(0)
         if self.use_short_conv:
             conv_state_q, conv_state_k, conv_state_v = None, None, None
@@ -243,9 +230,7 @@ class GatedDeltaProduct(nn.Module):
         if self.num_v_heads > self.num_heads:
             q, k = map(
-                lambda x: repeat(
-                    x, "... h d -> ... (h g) d", g=self.num_v_heads // self.num_heads
-                ),
                 (q, k),
             )
@@ -255,15 +240,11 @@ class GatedDeltaProduct(nn.Module):
         beta = rearrange(beta, "... l (n h) -> ... (l n) h", n=self.num_householder)
         if self.use_forget_gate:
-            g = -self.A_log.float().exp() * F.softplus(
-                self.a_proj(hidden_states).float() + self.dt_bias
-            )
         else:
             g = None
-        recurrent_state = (
-            last_state["recurrent_state"] if last_state is not None else None
-        )
         if mode == "chunk":
             o, recurrent_state = chunk_gated_delta_product(
                 q=q,
@@ -291,9 +272,7 @@ class GatedDeltaProduct(nn.Module):
                 g_new[:, :, 0] = g
                 g = rearrange(g_new, "... l n h -> ... (l n) h")
-            q_new = q.new_zeros(
-                q.shape[0], q.shape[1], self.num_householder, q.shape[2], q.shape[3]
-            )
             q_new[:, :, -1] = q
             q = rearrange(q_new, "... l n h d-> ... (l n) h d")
             if self.use_forget_gate:
@@ -305,9 +284,7 @@ class GatedDeltaProduct(nn.Module):
                     beta=beta,
                     initial_state=recurrent_state,
                     output_final_state=use_cache,
-                    cu_seqlens=cu_seqlens * self.num_householder
-                    if cu_seqlens is not None
-                    else None,
                     use_qk_l2norm_in_kernel=True,
                 )
             else:
@@ -318,29 +295,21 @@ class GatedDeltaProduct(nn.Module):
                     beta=beta,
                     initial_state=recurrent_state,
                     output_final_state=use_cache,
-                    cu_seqlens=cu_seqlens * self.num_householder
-                    if cu_seqlens is not None
-                    else None,
                     use_qk_l2norm_in_kernel=True,
                 )
-            o = rearrange(o, "... (l n) h d -> ... l n h d", n=self.num_householder)[
-                ..., -1, :, :
-            ].contiguous()
         if past_key_values is not None:
             past_key_values.update(
                 recurrent_state=recurrent_state,
-                conv_state=(conv_state_q, conv_state_k, conv_state_v)
-                if self.use_short_conv
-                else None,
                 layer_idx=self.layer_idx,
                 offset=q_len,
             )
         if self.use_gate:
-            g = rearrange(
-                self.g_proj(hidden_states), "... (h d) -> ... h d", d=self.head_v_dim
-            )
             o = self.o_norm(o, g)
         else:
             o = self.o_norm(o)

 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 from __future__ import annotations
 import math
 import warnings
+from typing import TYPE_CHECKING
 import torch
 import torch.nn as nn
         self.key_dim = int(self.num_heads * self.head_k_dim)
         self.value_dim = int(self.num_v_heads * self.head_v_dim)
         self.layer_idx = layer_idx
+        self.init_hidden_state = nn.Parameter(torch.randn(self.num_heads, self.head_dim, self.head_dim))
         # Consistency check: Ensure expand_v produces integer values
+        if not math.isclose(self.num_v_heads * self.head_dim * expand_v, self.value_dim, rel_tol=1e-5):
             raise ValueError(
+                f"expand_v={expand_v} does not produce an integer value when multiplied by key_dim={self.key_dim}. "(
+                    f"Resulting value_dim would be "
+                    f"{self.num_v_heads * self.head_dim * expand_v}, "
+                    "which is invalid for nn.Linear."
+                )
             )
         if self.num_v_heads > self.num_heads and self.num_v_heads % self.num_heads != 0:
+            raise ValueError(f"num_v_heads={self.num_v_heads} must be divisible by num_heads={self.num_heads}.")
         if not math.isclose(head_dim * expand_v, self.head_v_dim, rel_tol=1e-5):
             raise ValueError(
         self.q_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
         self.k_proj = nn.Linear(hidden_size, self.key_dim * num_householder, bias=False)
+        self.v_proj = nn.Linear(hidden_size, self.value_dim * num_householder, bias=False)
+        self.b_proj = nn.Linear(hidden_size, self.num_v_heads * num_householder, bias=False)
         if self.use_forget_gate:
             self.a_proj = nn.Linear(hidden_size, self.num_v_heads, bias=False)
             dt_min = 0.001
             dt_max = 0.1
             dt_init_floor = 1e-4
+            dt = torch.exp(torch.rand(self.num_v_heads) * (math.log(dt_max) - math.log(dt_min)) + math.log(dt_min))
             dt = torch.clamp(dt, min=dt_init_floor)
             # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
             inv_dt = dt + torch.log(-torch.expm1(-dt))
     def forward(
         self,
         hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        past_key_values: Cache | None = None,
+        initial_state: torch.Tensor | None = None,
+        use_cache: bool | None = False,
+        output_attentions: bool | None = False,
+        **kwargs: Unpack[dict],
+    ) -> tuple[torch.Tensor, torch.Tensor | None, Cache | None]:
         if attention_mask is not None:
             assert len(attention_mask.shape) == 2, (
                 "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
         cu_seqlens = kwargs.get("cu_seqlens", None)
         if attention_mask is not None:
             indices, cu_seqlens, _ = get_unpad_data(attention_mask[:, -q_len:])
+            hidden_states = index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices).unsqueeze(0)
         if self.use_short_conv:
             conv_state_q, conv_state_k, conv_state_v = None, None, None
         if self.num_v_heads > self.num_heads:
             q, k = map(
+                lambda x: repeat(x, "... h d -> ... (h g) d", g=self.num_v_heads // self.num_heads),
                 (q, k),
             )
         beta = rearrange(beta, "... l (n h) -> ... (l n) h", n=self.num_householder)
         if self.use_forget_gate:
+            g = -self.A_log.float().exp() * F.softplus(self.a_proj(hidden_states).float() + self.dt_bias)
         else:
             g = None
+        recurrent_state = last_state["recurrent_state"] if last_state is not None else None
         if mode == "chunk":
             o, recurrent_state = chunk_gated_delta_product(
                 q=q,
                 g_new[:, :, 0] = g
                 g = rearrange(g_new, "... l n h -> ... (l n) h")
+            q_new = q.new_zeros(q.shape[0], q.shape[1], self.num_householder, q.shape[2], q.shape[3])
             q_new[:, :, -1] = q
             q = rearrange(q_new, "... l n h d-> ... (l n) h d")
             if self.use_forget_gate:
                     beta=beta,
                     initial_state=recurrent_state,
                     output_final_state=use_cache,
+                    cu_seqlens=cu_seqlens * self.num_householder if cu_seqlens is not None else None,
                     use_qk_l2norm_in_kernel=True,
                 )
             else:
                     beta=beta,
                     initial_state=recurrent_state,
                     output_final_state=use_cache,
+                    cu_seqlens=cu_seqlens * self.num_householder if cu_seqlens is not None else None,
                     use_qk_l2norm_in_kernel=True,
                 )
+            o = rearrange(o, "... (l n) h d -> ... l n h d", n=self.num_householder)[..., -1, :, :].contiguous()
         if past_key_values is not None:
             past_key_values.update(
                 recurrent_state=recurrent_state,
+                conv_state=(conv_state_q, conv_state_k, conv_state_v) if self.use_short_conv else None,
                 layer_idx=self.layer_idx,
                 offset=q_len,
             )
         if self.use_gate:
+            g = rearrange(self.g_proj(hidden_states), "... (h d) -> ... h d", d=self.head_v_dim)
             o = self.o_norm(o, g)
         else:
             o = self.o_norm(o)

src/models/gated_deltaproduct/modeling_gated_deltaproduct.py CHANGED Viewed

@@ -1,8 +1,6 @@
-# -*- coding: utf-8 -*-
 from __future__ import annotations
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 import torch
 import torch.nn as nn
@@ -27,9 +25,7 @@ class GatedDeltaProductBlock(nn.Module):
         self.config = config
         self.layer_idx = layer_idx
-        self.attn_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(
-            config.hidden_size, eps=config.norm_eps
-        )
         if config.attn is not None and layer_idx in config.attn["layers"]:
             self.attn = Attention(
                 hidden_size=config.hidden_size,
@@ -57,9 +53,7 @@ class GatedDeltaProductBlock(nn.Module):
                 num_householder=config.num_householder,
                 layer_idx=layer_idx,
             )
-        self.mlp_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(
-            config.hidden_size, eps=config.norm_eps
-        )
         self.mlp = GatedDeltaProductMLP(
             hidden_size=config.hidden_size,
             hidden_ratio=config.hidden_ratio,
@@ -71,15 +65,13 @@ class GatedDeltaProductBlock(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
-        use_cache: Optional[bool] = False,
-        output_attentions: Optional[bool] = False,
-        initial_state: Optional[torch.FloatTensor] = None,
-        **kwargs: Unpack[Dict],
-    ) -> Tuple[
-        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
-    ]:
         residual = hidden_states
         hidden_states = self.attn_norm(hidden_states)
         hidden_states, attentions, past_key_values = self.attn(

 from __future__ import annotations
+from typing import TYPE_CHECKING
 import torch
 import torch.nn as nn
         self.config = config
         self.layer_idx = layer_idx
+        self.attn_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps)
         if config.attn is not None and layer_idx in config.attn["layers"]:
             self.attn = Attention(
                 hidden_size=config.hidden_size,
                 num_householder=config.num_householder,
                 layer_idx=layer_idx,
             )
+        self.mlp_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps)
         self.mlp = GatedDeltaProductMLP(
             hidden_size=config.hidden_size,
             hidden_ratio=config.hidden_ratio,
     def forward(
         self,
         hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        past_key_values: Cache | list[torch.FloatTensor] | None = None,
+        use_cache: bool | None = False,
+        output_attentions: bool | None = False,
+        initial_state: torch.FloatTensor | None = None,
+        **kwargs: Unpack[dict],
+    ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
         residual = hidden_states
         hidden_states = self.attn_norm(hidden_states)
         hidden_states, attentions, past_key_values = self.attn(

src/models/model.py CHANGED Viewed

@@ -69,9 +69,7 @@ class TimeSeriesModel(nn.Module):
         if self.loss_type == "quantile" and self.quantiles is None:
             raise ValueError("Quantiles must be provided for quantile loss.")
         if self.quantiles:
-            self.register_buffer(
-                "qt", torch.tensor(self.quantiles, device=device).view(1, 1, 1, -1)
-            )
         # Validate configuration before initialization
         self._validate_configuration()
@@ -89,8 +87,7 @@ class TimeSeriesModel(nn.Module):
         if self.embed_size % self.encoder_config["num_heads"] != 0:
             raise ValueError(
-                f"embed_size ({self.embed_size}) must be divisible by "
-                f"num_heads ({self.encoder_config['num_heads']})"
             )
     def _init_embedding_layers(self):
@@ -141,10 +138,7 @@ class TimeSeriesModel(nn.Module):
         self.initial_hidden_state = nn.ParameterList(
             [
                 nn.Parameter(
-                    torch.randn(
-                        1, self.encoder_config["num_heads"], head_k_dim, head_v_dim
-                    )
-                    / head_k_dim,
                     requires_grad=True,
                 )
                 for _ in range(num_initial_hidden_states)
@@ -174,16 +168,12 @@ class TimeSeriesModel(nn.Module):
             "batch_size": batch_size,
         }
-    def _compute_scaling(
-        self, history_values: torch.Tensor, history_mask: torch.Tensor = None
-    ):
         """Compute scaling statistics and apply scaling."""
         scale_statistics = self.scaler.compute_statistics(history_values, history_mask)
         return scale_statistics
-    def _apply_scaling_and_masking(
-        self, values: torch.Tensor, scale_statistics: dict, mask: torch.Tensor = None
-    ):
         """Apply scaling and optional masking to values."""
         scaled_values = self.scaler.scale(values, scale_statistics)
@@ -191,9 +181,7 @@ class TimeSeriesModel(nn.Module):
             scaled_values = scaled_values * mask.unsqueeze(-1).float()
         if self.scaler_clamp_value is not None:
-            scaled_values = torch.clamp(
-                scaled_values, -self.scaler_clamp_value, self.scaler_clamp_value
-            )
         return scaled_values
@@ -208,9 +196,7 @@ class TimeSeriesModel(nn.Module):
         seq_len = time_features.shape[1]
         if (torch.rand(1).item() < self.encoding_dropout) and drop_enc_allow:
-            return torch.zeros(
-                batch_size, seq_len, num_channels, self.embed_size, device=device
-            ).to(torch.float32)
         pos_embed = self.time_feature_projection(time_features)
         return pos_embed.unsqueeze(2).expand(-1, -1, num_channels, -1)
@@ -232,9 +218,7 @@ class TimeSeriesModel(nn.Module):
         # Suppress padded time steps completely so padding is a pure batching artifact
         # history_mask: [B, S] -> broadcast to [B, S, 1, 1]
         if history_mask is not None:
-            mask_broadcast = (
-                history_mask.unsqueeze(-1).unsqueeze(-1).to(channel_embeddings.dtype)
-            )
             channel_embeddings = channel_embeddings * mask_broadcast
         batch_size, seq_len = scaled_history.shape[:2]
@@ -260,9 +244,7 @@ class TimeSeriesModel(nn.Module):
         # Vectorize across channels by merging the batch and channel dimensions.
         # [B, S, N, E] -> [B*N, S, E]
         channel_embedded = (
-            embedded.permute(0, 2, 1, 3)
-            .contiguous()
-            .view(batch_size * num_channels, seq_len, self.embed_size)
         )
         # Reshape target positional embeddings similarly: [B, P, N, E] -> [B*N, P, E]
@@ -276,23 +258,16 @@ class TimeSeriesModel(nn.Module):
         x = torch.concatenate([x, target_repr], dim=1)
         if self.encoder_config.get("weaving", True):
             # initial hidden state is learnable
-            hidden_state = torch.zeros_like(
-                self.initial_hidden_state[0].repeat(batch_size * num_channels, 1, 1, 1)
-            )
             for layer_idx, encoder_layer in enumerate(self.encoder_layers):
                 x, hidden_state = encoder_layer(
                     x,
-                    hidden_state
-                    + self.initial_hidden_state[layer_idx].repeat(
-                        batch_size * num_channels, 1, 1, 1
-                    ),
                 )
         else:
             # initial hidden state is separately learnable for each layer
             for layer_idx, encoder_layer in enumerate(self.encoder_layers):
-                initial_hidden_state = self.initial_hidden_state[layer_idx].repeat(
-                    batch_size * num_channels, 1, 1, 1
-                )
                 x, _ = encoder_layer(x, initial_hidden_state)
         # Use the last prediction_length positions
@@ -304,18 +279,14 @@ class TimeSeriesModel(nn.Module):
         # Original shape: [B*N, P, Q] where Q is num_quantiles or 1
         # Reshape the output back to [B, P, N, Q]
         output_dim = len(self.quantiles) if self.loss_type == "quantile" else 1
-        predictions = predictions.view(
-            batch_size, num_channels, prediction_length, output_dim
-        )
         predictions = predictions.permute(0, 2, 1, 3)  # [B, P, N, Q]
         # Squeeze the last dimension if not in quantile mode for backward compatibility
         if self.loss_type != "quantile":
             predictions = predictions.squeeze(-1)  # [B, P, N]
         return predictions
-    def forward(
-        self, data_container: BatchTimeSeriesContainer, drop_enc_allow: bool = False
-    ):
         """Main forward pass."""
         # Preprocess data
         preprocessed = self._preprocess_data(data_container)
@@ -332,9 +303,7 @@ class TimeSeriesModel(nn.Module):
         )
         # Compute scaling
-        scale_statistics = self._compute_scaling(
-            preprocessed["history_values"], preprocessed["history_mask"]
-        )
         # Apply scaling
         history_scaled = self._apply_scaling_and_masking(
@@ -346,9 +315,7 @@ class TimeSeriesModel(nn.Module):
         # Scale future values if present
         future_scaled = None
         if preprocessed["future_values"] is not None:
-            future_scaled = self.scaler.scale(
-                preprocessed["future_values"], scale_statistics
-            )
         # Get positional embeddings
         history_pos_embed = self._get_positional_embeddings(
@@ -365,9 +332,7 @@ class TimeSeriesModel(nn.Module):
         )
         # Compute embeddings
-        history_embed = self._compute_embeddings(
-            history_scaled, history_pos_embed, preprocessed["history_mask"]
-        )
         # Generate predictions
         predictions = self._generate_predictions(
@@ -418,7 +383,8 @@ class TimeSeriesModel(nn.Module):
         if self.loss_type == "huber":
             if predictions.shape != future_scaled.shape:
                 raise ValueError(
-                    f"Shape mismatch for Huber loss: predictions {predictions.shape} vs future_scaled {future_scaled.shape}"
                 )
             return nn.functional.huber_loss(predictions, future_scaled)
         elif self.loss_type == "quantile":

         if self.loss_type == "quantile" and self.quantiles is None:
             raise ValueError("Quantiles must be provided for quantile loss.")
         if self.quantiles:
+            self.register_buffer("qt", torch.tensor(self.quantiles, device=device).view(1, 1, 1, -1))
         # Validate configuration before initialization
         self._validate_configuration()
         if self.embed_size % self.encoder_config["num_heads"] != 0:
             raise ValueError(
+                f"embed_size ({self.embed_size}) must be divisible by num_heads ({self.encoder_config['num_heads']})"
             )
     def _init_embedding_layers(self):
         self.initial_hidden_state = nn.ParameterList(
             [
                 nn.Parameter(
+                    torch.randn(1, self.encoder_config["num_heads"], head_k_dim, head_v_dim) / head_k_dim,
                     requires_grad=True,
                 )
                 for _ in range(num_initial_hidden_states)
             "batch_size": batch_size,
         }
+    def _compute_scaling(self, history_values: torch.Tensor, history_mask: torch.Tensor = None):
         """Compute scaling statistics and apply scaling."""
         scale_statistics = self.scaler.compute_statistics(history_values, history_mask)
         return scale_statistics
+    def _apply_scaling_and_masking(self, values: torch.Tensor, scale_statistics: dict, mask: torch.Tensor = None):
         """Apply scaling and optional masking to values."""
         scaled_values = self.scaler.scale(values, scale_statistics)
             scaled_values = scaled_values * mask.unsqueeze(-1).float()
         if self.scaler_clamp_value is not None:
+            scaled_values = torch.clamp(scaled_values, -self.scaler_clamp_value, self.scaler_clamp_value)
         return scaled_values
         seq_len = time_features.shape[1]
         if (torch.rand(1).item() < self.encoding_dropout) and drop_enc_allow:
+            return torch.zeros(batch_size, seq_len, num_channels, self.embed_size, device=device).to(torch.float32)
         pos_embed = self.time_feature_projection(time_features)
         return pos_embed.unsqueeze(2).expand(-1, -1, num_channels, -1)
         # Suppress padded time steps completely so padding is a pure batching artifact
         # history_mask: [B, S] -> broadcast to [B, S, 1, 1]
         if history_mask is not None:
+            mask_broadcast = history_mask.unsqueeze(-1).unsqueeze(-1).to(channel_embeddings.dtype)
             channel_embeddings = channel_embeddings * mask_broadcast
         batch_size, seq_len = scaled_history.shape[:2]
         # Vectorize across channels by merging the batch and channel dimensions.
         # [B, S, N, E] -> [B*N, S, E]
         channel_embedded = (
+            embedded.permute(0, 2, 1, 3).contiguous().view(batch_size * num_channels, seq_len, self.embed_size)
         )
         # Reshape target positional embeddings similarly: [B, P, N, E] -> [B*N, P, E]
         x = torch.concatenate([x, target_repr], dim=1)
         if self.encoder_config.get("weaving", True):
             # initial hidden state is learnable
+            hidden_state = torch.zeros_like(self.initial_hidden_state[0].repeat(batch_size * num_channels, 1, 1, 1))
             for layer_idx, encoder_layer in enumerate(self.encoder_layers):
                 x, hidden_state = encoder_layer(
                     x,
+                    hidden_state + self.initial_hidden_state[layer_idx].repeat(batch_size * num_channels, 1, 1, 1),
                 )
         else:
             # initial hidden state is separately learnable for each layer
             for layer_idx, encoder_layer in enumerate(self.encoder_layers):
+                initial_hidden_state = self.initial_hidden_state[layer_idx].repeat(batch_size * num_channels, 1, 1, 1)
                 x, _ = encoder_layer(x, initial_hidden_state)
         # Use the last prediction_length positions
         # Original shape: [B*N, P, Q] where Q is num_quantiles or 1
         # Reshape the output back to [B, P, N, Q]
         output_dim = len(self.quantiles) if self.loss_type == "quantile" else 1
+        predictions = predictions.view(batch_size, num_channels, prediction_length, output_dim)
         predictions = predictions.permute(0, 2, 1, 3)  # [B, P, N, Q]
         # Squeeze the last dimension if not in quantile mode for backward compatibility
         if self.loss_type != "quantile":
             predictions = predictions.squeeze(-1)  # [B, P, N]
         return predictions
+    def forward(self, data_container: BatchTimeSeriesContainer, drop_enc_allow: bool = False):
         """Main forward pass."""
         # Preprocess data
         preprocessed = self._preprocess_data(data_container)
         )
         # Compute scaling
+        scale_statistics = self._compute_scaling(preprocessed["history_values"], preprocessed["history_mask"])
         # Apply scaling
         history_scaled = self._apply_scaling_and_masking(
         # Scale future values if present
         future_scaled = None
         if preprocessed["future_values"] is not None:
+            future_scaled = self.scaler.scale(preprocessed["future_values"], scale_statistics)
         # Get positional embeddings
         history_pos_embed = self._get_positional_embeddings(
         )
         # Compute embeddings
+        history_embed = self._compute_embeddings(history_scaled, history_pos_embed, preprocessed["history_mask"])
         # Generate predictions
         predictions = self._generate_predictions(
         if self.loss_type == "huber":
             if predictions.shape != future_scaled.shape:
                 raise ValueError(
+                    f"Shape mismatch for Huber loss: predictions {predictions.shape} "
+                    f"vs future_scaled {future_scaled.shape}"
                 )
             return nn.functional.huber_loss(predictions, future_scaled)
         elif self.loss_type == "quantile":

src/optim/lr_scheduler.py CHANGED Viewed

@@ -3,7 +3,6 @@
 import math
 from enum import Enum
 from functools import partial
-from typing import Optional
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LambdaLR
@@ -128,9 +127,7 @@ def _get_cosine_schedule_with_warmup_lr_lambda(
     if current_step < num_warmup_steps:
         return float(current_step) / float(max(1, num_warmup_steps))
-    progress = float(current_step - num_warmup_steps) / float(
-        max(1, num_training_steps - num_warmup_steps)
-    )
     cosine_factor = 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
     return max(min_lr_ratio, cosine_factor)
@@ -176,15 +173,11 @@ def _get_cosine_with_restarts_lr_lambda(
     if current_step < num_warmup_steps:
         return float(current_step) / float(max(1, num_warmup_steps))
-    progress = float(current_step - num_warmup_steps) / float(
-        max(1, num_training_steps - num_warmup_steps)
-    )
     if progress >= 1.0:
         return min_lr_ratio
-    cosine_factor = 0.5 * (
-        1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))
-    )
     return max(min_lr_ratio, cosine_factor)
@@ -230,7 +223,7 @@ def get_scheduler(
     optimizer: Optimizer,
     num_warmup_steps: int,
     num_training_steps: int,
-    scheduler_kwargs: Optional[dict] = None,
 ):
     """
     Unified interface to create learning rate schedulers.
@@ -303,15 +296,11 @@ class WarmupStableDecayScheduler:
             return 1.0
         else:
             # Decay phase
-            decay_steps = (
-                self.total_steps - self.num_warmup_steps - self.num_stable_steps
-            )
             if decay_steps <= 0:
                 return max(self.min_lr_ratio, 1.0)
-            progress = (
-                step - self.num_warmup_steps - self.num_stable_steps
-            ) / decay_steps
             progress = min(progress, 1.0)
             if self.decay_type == "cosine":
@@ -327,14 +316,12 @@ class WarmupStableDecayScheduler:
         """Update learning rates for all parameter groups."""
         lr_factor = self.get_lr_factor(self.current_step)
-        for param_group, base_lr in zip(self.optimizer.param_groups, self.base_lrs):
             param_group["lr"] = base_lr * lr_factor
         if self.verbose and self.current_step % 1000 == 0:
             phase = self.get_phase()
-            print(
-                f"Step {self.current_step}: LR factor = {lr_factor:.6f}, Phase = {phase}"
-            )
         self.current_step += 1

 import math
 from enum import Enum
 from functools import partial
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LambdaLR
     if current_step < num_warmup_steps:
         return float(current_step) / float(max(1, num_warmup_steps))
+    progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
     cosine_factor = 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
     return max(min_lr_ratio, cosine_factor)
     if current_step < num_warmup_steps:
         return float(current_step) / float(max(1, num_warmup_steps))
+    progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
     if progress >= 1.0:
         return min_lr_ratio
+    cosine_factor = 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))
     return max(min_lr_ratio, cosine_factor)
     optimizer: Optimizer,
     num_warmup_steps: int,
     num_training_steps: int,
+    scheduler_kwargs: dict | None = None,
 ):
     """
     Unified interface to create learning rate schedulers.
             return 1.0
         else:
             # Decay phase
+            decay_steps = self.total_steps - self.num_warmup_steps - self.num_stable_steps
             if decay_steps <= 0:
                 return max(self.min_lr_ratio, 1.0)
+            progress = (step - self.num_warmup_steps - self.num_stable_steps) / decay_steps
             progress = min(progress, 1.0)
             if self.decay_type == "cosine":
         """Update learning rates for all parameter groups."""
         lr_factor = self.get_lr_factor(self.current_step)
+        for param_group, base_lr in zip(self.optimizer.param_groups, self.base_lrs, strict=True):
             param_group["lr"] = base_lr * lr_factor
         if self.verbose and self.current_step % 1000 == 0:
             phase = self.get_phase()
+            print(f"Step {self.current_step}: LR factor = {lr_factor:.6f}, Phase = {phase}")
         self.current_step += 1

src/plotting/gift_eval_utils.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import logging
-from typing import List, Optional, Tuple
 import numpy as np
 import pandas as pd
@@ -13,9 +12,7 @@ from src.plotting.plot_timeseries import (
 logger = logging.getLogger(__name__)
-def _prepare_data_for_plotting(
-    input_data: dict, label_data: dict, max_context_length: int
-):
     history_values = np.asarray(input_data["target"], dtype=np.float32)
     future_values = np.asarray(label_data["target"], dtype=np.float32)
     start_period = input_data["start"]
@@ -38,16 +35,14 @@ def _prepare_data_for_plotting(
     # Convert Period to Timestamp if needed
     start_timestamp = (
-        start_period.to_timestamp()
-        if hasattr(start_period, "to_timestamp")
-        else pd.Timestamp(start_period)
     )
     return history_values, future_values, start_timestamp
 def _extract_quantile_predictions(
     forecast,
-) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]:
     def ensure_2d_time_first(arr):
         if arr is None:
             return None
@@ -106,7 +101,7 @@ def _create_plot(
     dataset_full_name: str,
     dataset_freq: str,
     max_context_length: int,
-    title: Optional[str] = None,
 ):
     try:
         history_values, future_values, start_timestamp = _prepare_data_for_plotting(
@@ -140,9 +135,7 @@ def _create_plot(
                         pred_arr = pred_arr.T
                     else:
                         if pred_arr.size >= target_arr.shape[0]:
-                            pred_arr = pred_arr.flatten()[
-                                : target_arr.shape[0]
-                            ].reshape(-1, 1)
                             if target_arr.shape[1] > 1:
                                 pred_arr = np.broadcast_to(pred_arr, target_arr.shape)
             return pred_arr
@@ -171,20 +164,18 @@ def _create_plot(
 def create_plots_for_dataset(
-    forecasts: List,
     test_data,
     dataset_metadata,
     max_plots: int,
     max_context_length: int,
-) -> List[Tuple[object, str]]:
     input_data_list = list(test_data.input)
     label_data_list = list(test_data.label)
     num_plots = min(len(forecasts), max_plots)
-    logger.info(
-        f"Creating {num_plots} plots for {getattr(dataset_metadata, 'full_name', str(dataset_metadata))}"
-    )
-    figures_with_names: List[Tuple[object, str]] = []
     for i in range(num_plots):
         try:
             forecast = forecasts[i]
@@ -205,9 +196,7 @@ def create_plots_for_dataset(
                 title=title,
             )
             if fig is not None:
-                filename = (
-                    f"{getattr(dataset_metadata, 'freq', 'D')}_window_{i + 1:03d}.png"
-                )
                 figures_with_names.append((fig, filename))
         except Exception as e:
             logger.warning(f"Error creating plot for window {i + 1}: {e}")

 import logging
 import numpy as np
 import pandas as pd
 logger = logging.getLogger(__name__)
+def _prepare_data_for_plotting(input_data: dict, label_data: dict, max_context_length: int):
     history_values = np.asarray(input_data["target"], dtype=np.float32)
     future_values = np.asarray(label_data["target"], dtype=np.float32)
     start_period = input_data["start"]
     # Convert Period to Timestamp if needed
     start_timestamp = (
+        start_period.to_timestamp() if hasattr(start_period, "to_timestamp") else pd.Timestamp(start_period)
     )
     return history_values, future_values, start_timestamp
 def _extract_quantile_predictions(
     forecast,
+) -> tuple[np.ndarray | None, np.ndarray | None, np.ndarray | None]:
     def ensure_2d_time_first(arr):
         if arr is None:
             return None
     dataset_full_name: str,
     dataset_freq: str,
     max_context_length: int,
+    title: str | None = None,
 ):
     try:
         history_values, future_values, start_timestamp = _prepare_data_for_plotting(
                         pred_arr = pred_arr.T
                     else:
                         if pred_arr.size >= target_arr.shape[0]:
+                            pred_arr = pred_arr.flatten()[: target_arr.shape[0]].reshape(-1, 1)
                             if target_arr.shape[1] > 1:
                                 pred_arr = np.broadcast_to(pred_arr, target_arr.shape)
             return pred_arr
 def create_plots_for_dataset(
+    forecasts: list,
     test_data,
     dataset_metadata,
     max_plots: int,
     max_context_length: int,
+) -> list[tuple[object, str]]:
     input_data_list = list(test_data.input)
     label_data_list = list(test_data.label)
     num_plots = min(len(forecasts), max_plots)
+    logger.info(f"Creating {num_plots} plots for {getattr(dataset_metadata, 'full_name', str(dataset_metadata))}")
+    figures_with_names: list[tuple[object, str]] = []
     for i in range(num_plots):
         try:
             forecast = forecasts[i]
                 title=title,
             )
             if fig is not None:
+                filename = f"{getattr(dataset_metadata, 'freq', 'D')}_window_{i + 1:03d}.png"
                 figures_with_names.append((fig, filename))
         except Exception as e:
             logger.warning(f"Error creating plot for window {i + 1}: {e}")

src/plotting/plot_timeseries.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import logging
-from typing import List, Optional, Tuple, Union
 import matplotlib.pyplot as plt
 import numpy as np
@@ -18,40 +17,30 @@ def calculate_smape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
     """Calculate Symmetric Mean Absolute Percentage Error (SMAPE)."""
     pred_tensor = torch.from_numpy(y_pred).float()
     true_tensor = torch.from_numpy(y_true).float()
-    return torchmetrics.SymmetricMeanAbsolutePercentageError()(
-        pred_tensor, true_tensor
-    ).item()
 def _create_date_ranges(
-    start: Optional[Union[np.datetime64, pd.Timestamp]],
-    frequency: Optional[Union[Frequency, str]],
     history_length: int,
     prediction_length: int,
-) -> Tuple[pd.DatetimeIndex, pd.DatetimeIndex]:
     """Create date ranges for history and future periods."""
     if start is not None and frequency is not None:
         start_timestamp = pd.Timestamp(start)
         pandas_freq = frequency.to_pandas_freq(for_date_range=True)
-        history_dates = pd.date_range(
-            start=start_timestamp, periods=history_length, freq=pandas_freq
-        )
         if prediction_length > 0:
-            next_timestamp = history_dates[-1] + pd.tseries.frequencies.to_offset(
-                pandas_freq
-            )
-            future_dates = pd.date_range(
-                start=next_timestamp, periods=prediction_length, freq=pandas_freq
-            )
         else:
             future_dates = pd.DatetimeIndex([])
     else:
         # Fallback to default daily frequency
-        history_dates = pd.date_range(
-            end=pd.Timestamp.now(), periods=history_length, freq="D"
-        )
         if prediction_length > 0:
             future_dates = pd.date_range(
@@ -71,16 +60,14 @@ def _plot_single_channel(
     history_dates: pd.DatetimeIndex,
     future_dates: pd.DatetimeIndex,
     history_values: np.ndarray,
-    future_values: Optional[np.ndarray] = None,
-    predicted_values: Optional[np.ndarray] = None,
-    lower_bound: Optional[np.ndarray] = None,
-    upper_bound: Optional[np.ndarray] = None,
 ) -> None:
     """Plot a single channel's time series data."""
     # Plot history
-    ax.plot(
-        history_dates, history_values[:, channel_idx], color="black", label="History"
-    )
     # Plot ground truth future
     if future_values is not None:
@@ -116,11 +103,9 @@ def _plot_single_channel(
     ax.grid(True, which="both", linestyle="--", linewidth=0.5)
-def _setup_figure(num_channels: int) -> Tuple[Figure, List[plt.Axes]]:
     """Create and configure the matplotlib figure and axes."""
-    fig, axes = plt.subplots(
-        num_channels, 1, figsize=(15, 3 * num_channels), sharex=True
-    )
     if num_channels == 1:
         axes = [axes]
     return fig, axes
@@ -128,10 +113,10 @@ def _setup_figure(num_channels: int) -> Tuple[Figure, List[plt.Axes]]:
 def _finalize_plot(
     fig: Figure,
-    axes: List[plt.Axes],
-    title: Optional[str] = None,
-    smape_value: Optional[float] = None,
-    output_file: Optional[str] = None,
     show: bool = True,
 ) -> None:
     """Add legend, title, and save/show the plot."""
@@ -159,15 +144,15 @@ def _finalize_plot(
 def plot_multivariate_timeseries(
     history_values: np.ndarray,
-    future_values: Optional[np.ndarray] = None,
-    predicted_values: Optional[np.ndarray] = None,
-    start: Optional[Union[np.datetime64, pd.Timestamp]] = None,
-    frequency: Optional[Union[Frequency, str]] = None,
-    title: Optional[str] = None,
-    output_file: Optional[str] = None,
     show: bool = True,
-    lower_bound: Optional[np.ndarray] = None,
-    upper_bound: Optional[np.ndarray] = None,
 ) -> Figure:
     """Plot a multivariate time series with history, future, predictions, and uncertainty bands."""
     # Calculate SMAPE if both predicted and true values are available
@@ -188,9 +173,7 @@ def plot_multivariate_timeseries(
     )
     # Create date ranges
-    history_dates, future_dates = _create_date_ranges(
-        start, frequency, history_length, prediction_length
-    )
     # Setup figure
     fig, axes = _setup_figure(num_channels)
@@ -217,8 +200,8 @@ def plot_multivariate_timeseries(
 def _extract_quantile_predictions(
     predicted_values: np.ndarray,
-    model_quantiles: List[float],
-) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]:
     """Extract median, lower, and upper bound predictions from quantile output."""
     try:
         median_idx = model_quantiles.index(0.5)
@@ -231,9 +214,7 @@ def _extract_quantile_predictions(
         return median_preds, lower_bound, upper_bound
     except (ValueError, IndexError):
-        logger.warning(
-            "Could not find 0.1, 0.5, 0.9 quantiles for plotting. Using median of available quantiles."
-        )
         median_preds = predicted_values[..., predicted_values.shape[-1] // 2]
         return median_preds, None, None
@@ -241,10 +222,10 @@ def _extract_quantile_predictions(
 def plot_from_container(
     batch: BatchTimeSeriesContainer,
     sample_idx: int,
-    predicted_values: Optional[np.ndarray] = None,
-    model_quantiles: Optional[List[float]] = None,
-    title: Optional[str] = None,
-    output_file: Optional[str] = None,
     show: bool = True,
 ) -> Figure:
     """Plot a single sample from a BatchTimeSeriesContainer with proper quantile handling."""
@@ -256,8 +237,7 @@ def plot_from_container(
     if predicted_values is not None:
         # Handle batch vs single sample predictions
         if predicted_values.ndim >= 3 or (
-            predicted_values.ndim == 2
-            and predicted_values.shape[0] > future_values.shape[0]
         ):
             sample_preds = predicted_values[sample_idx]
         else:
@@ -265,9 +245,7 @@ def plot_from_container(
         # Extract quantile information if available
         if model_quantiles:
-            median_preds, lower_bound, upper_bound = _extract_quantile_predictions(
-                sample_preds, model_quantiles
-            )
         else:
             median_preds = sample_preds
             lower_bound = None

 import logging
 import matplotlib.pyplot as plt
 import numpy as np
     """Calculate Symmetric Mean Absolute Percentage Error (SMAPE)."""
     pred_tensor = torch.from_numpy(y_pred).float()
     true_tensor = torch.from_numpy(y_true).float()
+    return torchmetrics.SymmetricMeanAbsolutePercentageError()(pred_tensor, true_tensor).item()
 def _create_date_ranges(
+    start: np.datetime64 | pd.Timestamp | None,
+    frequency: Frequency | str | None,
     history_length: int,
     prediction_length: int,
+) -> tuple[pd.DatetimeIndex, pd.DatetimeIndex]:
     """Create date ranges for history and future periods."""
     if start is not None and frequency is not None:
         start_timestamp = pd.Timestamp(start)
         pandas_freq = frequency.to_pandas_freq(for_date_range=True)
+        history_dates = pd.date_range(start=start_timestamp, periods=history_length, freq=pandas_freq)
         if prediction_length > 0:
+            next_timestamp = history_dates[-1] + pd.tseries.frequencies.to_offset(pandas_freq)
+            future_dates = pd.date_range(start=next_timestamp, periods=prediction_length, freq=pandas_freq)
         else:
             future_dates = pd.DatetimeIndex([])
     else:
         # Fallback to default daily frequency
+        history_dates = pd.date_range(end=pd.Timestamp.now(), periods=history_length, freq="D")
         if prediction_length > 0:
             future_dates = pd.date_range(
     history_dates: pd.DatetimeIndex,
     future_dates: pd.DatetimeIndex,
     history_values: np.ndarray,
+    future_values: np.ndarray | None = None,
+    predicted_values: np.ndarray | None = None,
+    lower_bound: np.ndarray | None = None,
+    upper_bound: np.ndarray | None = None,
 ) -> None:
     """Plot a single channel's time series data."""
     # Plot history
+    ax.plot(history_dates, history_values[:, channel_idx], color="black", label="History")
     # Plot ground truth future
     if future_values is not None:
     ax.grid(True, which="both", linestyle="--", linewidth=0.5)
+def _setup_figure(num_channels: int) -> tuple[Figure, list[plt.Axes]]:
     """Create and configure the matplotlib figure and axes."""
+    fig, axes = plt.subplots(num_channels, 1, figsize=(15, 3 * num_channels), sharex=True)
     if num_channels == 1:
         axes = [axes]
     return fig, axes
 def _finalize_plot(
     fig: Figure,
+    axes: list[plt.Axes],
+    title: str | None = None,
+    smape_value: float | None = None,
+    output_file: str | None = None,
     show: bool = True,
 ) -> None:
     """Add legend, title, and save/show the plot."""
 def plot_multivariate_timeseries(
     history_values: np.ndarray,
+    future_values: np.ndarray | None = None,
+    predicted_values: np.ndarray | None = None,
+    start: np.datetime64 | pd.Timestamp | None = None,
+    frequency: Frequency | str | None = None,
+    title: str | None = None,
+    output_file: str | None = None,
     show: bool = True,
+    lower_bound: np.ndarray | None = None,
+    upper_bound: np.ndarray | None = None,
 ) -> Figure:
     """Plot a multivariate time series with history, future, predictions, and uncertainty bands."""
     # Calculate SMAPE if both predicted and true values are available
     )
     # Create date ranges
+    history_dates, future_dates = _create_date_ranges(start, frequency, history_length, prediction_length)
     # Setup figure
     fig, axes = _setup_figure(num_channels)
 def _extract_quantile_predictions(
     predicted_values: np.ndarray,
+    model_quantiles: list[float],
+) -> tuple[np.ndarray | None, np.ndarray | None, np.ndarray | None]:
     """Extract median, lower, and upper bound predictions from quantile output."""
     try:
         median_idx = model_quantiles.index(0.5)
         return median_preds, lower_bound, upper_bound
     except (ValueError, IndexError):
+        logger.warning("Could not find 0.1, 0.5, 0.9 quantiles for plotting. Using median of available quantiles.")
         median_preds = predicted_values[..., predicted_values.shape[-1] // 2]
         return median_preds, None, None
 def plot_from_container(
     batch: BatchTimeSeriesContainer,
     sample_idx: int,
+    predicted_values: np.ndarray | None = None,
+    model_quantiles: list[float] | None = None,
+    title: str | None = None,
+    output_file: str | None = None,
     show: bool = True,
 ) -> Figure:
     """Plot a single sample from a BatchTimeSeriesContainer with proper quantile handling."""
     if predicted_values is not None:
         # Handle batch vs single sample predictions
         if predicted_values.ndim >= 3 or (
+            predicted_values.ndim == 2 and predicted_values.shape[0] > future_values.shape[0]
         ):
             sample_preds = predicted_values[sample_idx]
         else:
         # Extract quantile information if available
         if model_quantiles:
+            median_preds, lower_bound, upper_bound = _extract_quantile_predictions(sample_preds, model_quantiles)
         else:
             median_preds = sample_preds
             lower_bound = None

src/synthetic_generation/abstract_classes.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict, Optional
 import numpy as np
 import torch
@@ -18,7 +18,7 @@ class AbstractTimeSeriesGenerator(ABC):
     """
     @abstractmethod
-    def generate_time_series(self, random_seed: Optional[int] = None) -> np.ndarray:
         """
         Generate synthetic time series data.
@@ -64,7 +64,7 @@ class GeneratorWrapper:
         np.random.seed(seed)
         torch.manual_seed(seed)
-    def _sample_parameters(self, batch_size: int) -> Dict[str, Any]:
         """
         Sample parameters with total_length fixed and history_length calculated.
@@ -76,14 +76,8 @@ class GeneratorWrapper:
         """
         # Select a suitable frequency based on the total length
-        frequency = [
-            select_safe_random_frequency(self.params.length, self.rng)
-            for _ in range(batch_size)
-        ]
-        start = [
-            select_safe_start_date(self.params.length, frequency[i], self.rng)
-            for i in range(batch_size)
-        ]
         return {
             "frequency": frequency,
@@ -91,7 +85,5 @@ class GeneratorWrapper:
         }
     @abstractmethod
-    def generate_batch(
-        self, batch_size: int, seed: Optional[int] = None, **kwargs
-    ) -> TimeSeriesContainer:
         raise NotImplementedError("Subclasses must implement generate_batch()")

 from abc import ABC, abstractmethod
+from typing import Any
 import numpy as np
 import torch
     """
     @abstractmethod
+    def generate_time_series(self, random_seed: int | None = None) -> np.ndarray:
         """
         Generate synthetic time series data.
         np.random.seed(seed)
         torch.manual_seed(seed)
+    def _sample_parameters(self, batch_size: int) -> dict[str, Any]:
         """
         Sample parameters with total_length fixed and history_length calculated.
         """
         # Select a suitable frequency based on the total length
+        frequency = [select_safe_random_frequency(self.params.length, self.rng) for _ in range(batch_size)]
+        start = [select_safe_start_date(self.params.length, frequency[i], self.rng) for i in range(batch_size)]
         return {
             "frequency": frequency,
         }
     @abstractmethod
+    def generate_batch(self, batch_size: int, seed: int | None = None, **kwargs) -> TimeSeriesContainer:
         raise NotImplementedError("Subclasses must implement generate_batch()")

src/synthetic_generation/anomalies/anomaly_generator.py CHANGED Viewed

@@ -1,7 +1,4 @@
-from typing import List, Optional, Set
 import numpy as np
 from src.synthetic_generation.abstract_classes import AbstractTimeSeriesGenerator
 from src.synthetic_generation.generator_params import (
     AnomalyGeneratorParams,
@@ -43,7 +40,7 @@ class AnomalyGenerator(AbstractTimeSeriesGenerator):
         else:
             return AnomalyType.SPIKE_DOWN
-    def _generate_spike_positions(self) -> List[List[int]]:
         """
         Generate spike positions:
         - Always create uniformly spaced single spikes (base schedule)
@@ -62,7 +59,7 @@ class AnomalyGenerator(AbstractTimeSeriesGenerator):
         base_positions = list(range(start_position, self.params.length, base_period))
         # Start with single-spike events at base positions
-        spike_events: List[List[int]] = [[pos] for pos in base_positions]
         if not base_positions:
             return spike_events
@@ -73,9 +70,7 @@ class AnomalyGenerator(AbstractTimeSeriesGenerator):
         # 25%: augment with clusters near some base spikes
         if series_draw < self.params.cluster_series_probability:
             num_base_events = len(base_positions)
-            num_to_augment = max(
-                1, int(round(self.params.cluster_event_fraction * num_base_events))
-            )
             num_to_augment = min(num_to_augment, num_base_events)
             chosen_indices = (
@@ -87,9 +82,7 @@ class AnomalyGenerator(AbstractTimeSeriesGenerator):
             for idx in chosen_indices:
                 base_pos = base_positions[int(idx)]
                 # Number of additional spikes (1..3) per selected event
-                num_additional = np.random.randint(
-                    *self.params.cluster_additional_spikes_range
-                )
                 if num_additional <= 0:
                     continue
@@ -101,7 +94,7 @@ class AnomalyGenerator(AbstractTimeSeriesGenerator):
                 )
                 offsets = [int(off) for off in offsets if off != 0]
-                cluster_positions: Set[int] = set([base_pos])
                 for off in offsets:
                     pos = base_pos + off
                     if 0 <= pos < self.params.length:
@@ -110,23 +103,16 @@ class AnomalyGenerator(AbstractTimeSeriesGenerator):
                 spike_events[int(idx)] = sorted(cluster_positions)
         # Next 25%: add random single spikes across the series
-        elif series_draw < (
-            self.params.cluster_series_probability
-            + self.params.random_series_probability
-        ):
             num_base_events = len(base_positions)
-            num_random = int(
-                round(self.params.random_spike_fraction_of_base * num_base_events)
-            )
             if num_random > 0:
                 all_indices = np.arange(self.params.length)
                 base_array = np.array(base_positions, dtype=int)
                 candidates = np.setdiff1d(all_indices, base_array, assume_unique=False)
                 if candidates.size > 0:
                     choose_n = min(num_random, candidates.size)
-                    rand_positions = np.random.choice(
-                        candidates, size=choose_n, replace=False
-                    )
                     for pos in rand_positions:
                         spike_events.append([int(pos)])
@@ -154,9 +140,7 @@ class AnomalyGenerator(AbstractTimeSeriesGenerator):
         if self.params.magnitude_pattern == MagnitudePattern.CONSTANT:
             # All spikes have similar magnitude with small noise
             magnitudes = np.full(total_spikes, base_magnitude)
-            noise = np.random.normal(
-                0, self.params.magnitude_noise * base_magnitude, total_spikes
-            )
             magnitudes += noise
         elif self.params.magnitude_pattern == MagnitudePattern.INCREASING:
@@ -183,9 +167,7 @@ class AnomalyGenerator(AbstractTimeSeriesGenerator):
             if cycle_length == 0:
                 cycle_length = max(1, total_spikes // 4)
-            phase = np.linspace(
-                0, 2 * np.pi * total_spikes / cycle_length, total_spikes
-            )
             cyclical_component = 0.3 * base_magnitude * np.sin(phase)
             magnitudes = base_magnitude + cyclical_component
@@ -205,9 +187,7 @@ class AnomalyGenerator(AbstractTimeSeriesGenerator):
                 )
         # Add noise to all patterns
-        noise = np.random.normal(
-            0, self.params.magnitude_noise * base_magnitude, total_spikes
-        )
         magnitudes += noise
         # Ensure magnitudes are positive and within reasonable bounds
@@ -217,9 +197,7 @@ class AnomalyGenerator(AbstractTimeSeriesGenerator):
         return magnitudes
-    def _inject_spike_anomalies(
-        self, signal: np.ndarray, spike_direction: AnomalyType
-    ) -> np.ndarray:
         """
         Inject spike anomalies into the clean signal using realistic patterns.
@@ -263,7 +241,7 @@ class AnomalyGenerator(AbstractTimeSeriesGenerator):
         return anomalous_signal
-    def generate_time_series(self, random_seed: Optional[int] = None) -> np.ndarray:
         """
         Generate a synthetic time series with realistic spike anomalies.

 import numpy as np
 from src.synthetic_generation.abstract_classes import AbstractTimeSeriesGenerator
 from src.synthetic_generation.generator_params import (
     AnomalyGeneratorParams,
         else:
             return AnomalyType.SPIKE_DOWN
+    def _generate_spike_positions(self) -> list[list[int]]:
         """
         Generate spike positions:
         - Always create uniformly spaced single spikes (base schedule)
         base_positions = list(range(start_position, self.params.length, base_period))
         # Start with single-spike events at base positions
+        spike_events: list[list[int]] = [[pos] for pos in base_positions]
         if not base_positions:
             return spike_events
         # 25%: augment with clusters near some base spikes
         if series_draw < self.params.cluster_series_probability:
             num_base_events = len(base_positions)
+            num_to_augment = max(1, int(round(self.params.cluster_event_fraction * num_base_events)))
             num_to_augment = min(num_to_augment, num_base_events)
             chosen_indices = (
             for idx in chosen_indices:
                 base_pos = base_positions[int(idx)]
                 # Number of additional spikes (1..3) per selected event
+                num_additional = np.random.randint(*self.params.cluster_additional_spikes_range)
                 if num_additional <= 0:
                     continue
                 )
                 offsets = [int(off) for off in offsets if off != 0]
+                cluster_positions: set[int] = {base_pos}
                 for off in offsets:
                     pos = base_pos + off
                     if 0 <= pos < self.params.length:
                 spike_events[int(idx)] = sorted(cluster_positions)
         # Next 25%: add random single spikes across the series
+        elif series_draw < (self.params.cluster_series_probability + self.params.random_series_probability):
             num_base_events = len(base_positions)
+            num_random = int(round(self.params.random_spike_fraction_of_base * num_base_events))
             if num_random > 0:
                 all_indices = np.arange(self.params.length)
                 base_array = np.array(base_positions, dtype=int)
                 candidates = np.setdiff1d(all_indices, base_array, assume_unique=False)
                 if candidates.size > 0:
                     choose_n = min(num_random, candidates.size)
+                    rand_positions = np.random.choice(candidates, size=choose_n, replace=False)
                     for pos in rand_positions:
                         spike_events.append([int(pos)])
         if self.params.magnitude_pattern == MagnitudePattern.CONSTANT:
             # All spikes have similar magnitude with small noise
             magnitudes = np.full(total_spikes, base_magnitude)
+            noise = np.random.normal(0, self.params.magnitude_noise * base_magnitude, total_spikes)
             magnitudes += noise
         elif self.params.magnitude_pattern == MagnitudePattern.INCREASING:
             if cycle_length == 0:
                 cycle_length = max(1, total_spikes // 4)
+            phase = np.linspace(0, 2 * np.pi * total_spikes / cycle_length, total_spikes)
             cyclical_component = 0.3 * base_magnitude * np.sin(phase)
             magnitudes = base_magnitude + cyclical_component
                 )
         # Add noise to all patterns
+        noise = np.random.normal(0, self.params.magnitude_noise * base_magnitude, total_spikes)
         magnitudes += noise
         # Ensure magnitudes are positive and within reasonable bounds
         return magnitudes
+    def _inject_spike_anomalies(self, signal: np.ndarray, spike_direction: AnomalyType) -> np.ndarray:
         """
         Inject spike anomalies into the clean signal using realistic patterns.
         return anomalous_signal
+    def generate_time_series(self, random_seed: int | None = None) -> np.ndarray:
         """
         Generate a synthetic time series with realistic spike anomalies.

src/synthetic_generation/anomalies/anomaly_generator_wrapper.py CHANGED Viewed

@@ -1,7 +1,4 @@
-from typing import Optional
 import numpy as np
 from src.data.containers import TimeSeriesContainer
 from src.synthetic_generation.abstract_classes import GeneratorWrapper
 from src.synthetic_generation.anomalies.anomaly_generator import AnomalyGenerator
@@ -25,9 +22,7 @@ class AnomalyGeneratorWrapper(GeneratorWrapper):
         super().__init__(params)
         self.generator = AnomalyGenerator(params)
-    def generate_batch(
-        self, batch_size: int, seed: Optional[int] = None
-    ) -> TimeSeriesContainer:
         """
         Generate a batch of anomaly time series.

 import numpy as np
 from src.data.containers import TimeSeriesContainer
 from src.synthetic_generation.abstract_classes import GeneratorWrapper
 from src.synthetic_generation.anomalies.anomaly_generator import AnomalyGenerator
         super().__init__(params)
         self.generator = AnomalyGenerator(params)
+    def generate_batch(self, batch_size: int, seed: int | None = None) -> TimeSeriesContainer:
         """
         Generate a batch of anomaly time series.

src/synthetic_generation/audio_generators/financial_volatility_generator.py CHANGED Viewed

@@ -1,8 +1,5 @@
-from typing import Optional
 import numpy as np
 from pyo import LFO, BrownNoise, Follower, Metro, Mix, Sine, TrigExpseg
 from src.synthetic_generation.abstract_classes import AbstractTimeSeriesGenerator
 from src.synthetic_generation.audio_generators.utils import (
     normalize_waveform,
@@ -35,7 +32,7 @@ class FinancialVolatilityAudioGenerator(AbstractTimeSeriesGenerator):
         jump_env_decay_time_range: tuple[float, float],
         jump_freq_range: tuple[float, float],
         jump_direction_up_probability: float,
-        random_seed: Optional[int] = None,
     ):
         self.length = length
         self.server_duration = server_duration
@@ -66,9 +63,7 @@ class FinancialVolatilityAudioGenerator(AbstractTimeSeriesGenerator):
         follower_freq = self.rng.uniform(*self.follower_freq_range)
         volatility_min, volatility_max = self.volatility_range
         volatility_osc = Sine(freq=carrier_freq)
-        volatility = Follower(volatility_osc, freq=follower_freq).range(
-            volatility_min, volatility_max
-        )
         market_noise = BrownNoise(mul=volatility)
         # Jumps
@@ -76,19 +71,15 @@ class FinancialVolatilityAudioGenerator(AbstractTimeSeriesGenerator):
         jump_env_start = self.rng.uniform(*self.jump_env_start_range)
         jump_env_decay = self.rng.uniform(*self.jump_env_decay_time_range)
         jump_freq = self.rng.uniform(*self.jump_freq_range)
-        direction = (
-            1.0 if self.rng.random() < self.jump_direction_up_probability else -1.0
-        )
         jump_trigger = Metro(time=jump_time).play()
-        jump_env = TrigExpseg(
-            jump_trigger, list=[(0.0, jump_env_start), (jump_env_decay, 0.0)]
-        )
         jumps = Sine(freq=jump_freq, mul=jump_env * direction)
         return Mix([trend, market_noise, jumps], voices=1)
-    def generate_time_series(self, random_seed: Optional[int] = None) -> np.ndarray:
         if random_seed is not None:
             self.rng = np.random.default_rng(random_seed)

 import numpy as np
 from pyo import LFO, BrownNoise, Follower, Metro, Mix, Sine, TrigExpseg
 from src.synthetic_generation.abstract_classes import AbstractTimeSeriesGenerator
 from src.synthetic_generation.audio_generators.utils import (
     normalize_waveform,
         jump_env_decay_time_range: tuple[float, float],
         jump_freq_range: tuple[float, float],
         jump_direction_up_probability: float,
+        random_seed: int | None = None,
     ):
         self.length = length
         self.server_duration = server_duration
         follower_freq = self.rng.uniform(*self.follower_freq_range)
         volatility_min, volatility_max = self.volatility_range
         volatility_osc = Sine(freq=carrier_freq)
+        volatility = Follower(volatility_osc, freq=follower_freq).range(volatility_min, volatility_max)
         market_noise = BrownNoise(mul=volatility)
         # Jumps
         jump_env_start = self.rng.uniform(*self.jump_env_start_range)
         jump_env_decay = self.rng.uniform(*self.jump_env_decay_time_range)
         jump_freq = self.rng.uniform(*self.jump_freq_range)
+        direction = 1.0 if self.rng.random() < self.jump_direction_up_probability else -1.0
         jump_trigger = Metro(time=jump_time).play()
+        jump_env = TrigExpseg(jump_trigger, list=[(0.0, jump_env_start), (jump_env_decay, 0.0)])
         jumps = Sine(freq=jump_freq, mul=jump_env * direction)
         return Mix([trend, market_noise, jumps], voices=1)
+    def generate_time_series(self, random_seed: int | None = None) -> np.ndarray:
         if random_seed is not None:
             self.rng = np.random.default_rng(random_seed)

src/synthetic_generation/audio_generators/financial_volatility_wrapper.py CHANGED Viewed

@@ -1,7 +1,6 @@
-from typing import Any, Dict, Optional
 import numpy as np
 from src.data.containers import TimeSeriesContainer
 from src.synthetic_generation.abstract_classes import GeneratorWrapper
 from src.synthetic_generation.audio_generators.financial_volatility_generator import (
@@ -15,7 +14,7 @@ class FinancialVolatilityAudioWrapper(GeneratorWrapper):
         super().__init__(params)
         self.params: FinancialVolatilityAudioParams = params
-    def _sample_parameters(self, batch_size: int) -> Dict[str, Any]:
         params = super()._sample_parameters(batch_size)
         params.update(
             {
@@ -43,8 +42,8 @@ class FinancialVolatilityAudioWrapper(GeneratorWrapper):
     def generate_batch(
         self,
         batch_size: int,
-        seed: Optional[int] = None,
-        params: Optional[Dict[str, Any]] = None,
     ) -> TimeSeriesContainer:
         if seed is not None:
             self._set_random_seeds(seed)

+from typing import Any
 import numpy as np
 from src.data.containers import TimeSeriesContainer
 from src.synthetic_generation.abstract_classes import GeneratorWrapper
 from src.synthetic_generation.audio_generators.financial_volatility_generator import (
         super().__init__(params)
         self.params: FinancialVolatilityAudioParams = params
+    def _sample_parameters(self, batch_size: int) -> dict[str, Any]:
         params = super()._sample_parameters(batch_size)
         params.update(
             {
     def generate_batch(
         self,
         batch_size: int,
+        seed: int | None = None,
+        params: dict[str, Any] | None = None,
     ) -> TimeSeriesContainer:
         if seed is not None:
             self._set_random_seeds(seed)

src/synthetic_generation/audio_generators/multi_scale_fractal_generator.py CHANGED Viewed

@@ -1,8 +1,5 @@
-from typing import Optional
 import numpy as np
 from pyo import Biquad, BrownNoise, Mix
 from src.synthetic_generation.abstract_classes import AbstractTimeSeriesGenerator
 from src.synthetic_generation.audio_generators.utils import (
     normalize_waveform,
@@ -27,7 +24,7 @@ class MultiScaleFractalAudioGenerator(AbstractTimeSeriesGenerator):
         scale_freq_base_range: tuple[float, float],
         q_factor_range: tuple[float, float],
         per_scale_attenuation_range: tuple[float, float],
-        random_seed: Optional[int] = None,
     ):
         self.length = length
         self.server_duration = server_duration
@@ -46,9 +43,7 @@ class MultiScaleFractalAudioGenerator(AbstractTimeSeriesGenerator):
         base_mul = self.rng.uniform(*self.base_noise_mul_range)
         base = BrownNoise(mul=base_mul)
-        num_scales = int(
-            self.rng.integers(self.num_scales_range[0], self.num_scales_range[1] + 1)
-        )
         scales = []
         for i in range(num_scales):
@@ -60,7 +55,7 @@ class MultiScaleFractalAudioGenerator(AbstractTimeSeriesGenerator):
         return Mix(scales, voices=1)
-    def generate_time_series(self, random_seed: Optional[int] = None) -> np.ndarray:
         if random_seed is not None:
             self.rng = np.random.default_rng(random_seed)

 import numpy as np
 from pyo import Biquad, BrownNoise, Mix
 from src.synthetic_generation.abstract_classes import AbstractTimeSeriesGenerator
 from src.synthetic_generation.audio_generators.utils import (
     normalize_waveform,
         scale_freq_base_range: tuple[float, float],
         q_factor_range: tuple[float, float],
         per_scale_attenuation_range: tuple[float, float],
+        random_seed: int | None = None,
     ):
         self.length = length
         self.server_duration = server_duration
         base_mul = self.rng.uniform(*self.base_noise_mul_range)
         base = BrownNoise(mul=base_mul)
+        num_scales = int(self.rng.integers(self.num_scales_range[0], self.num_scales_range[1] + 1))
         scales = []
         for i in range(num_scales):
         return Mix(scales, voices=1)
+    def generate_time_series(self, random_seed: int | None = None) -> np.ndarray:
         if random_seed is not None:
             self.rng = np.random.default_rng(random_seed)

src/synthetic_generation/audio_generators/multi_scale_fractal_wrapper.py CHANGED Viewed

@@ -1,7 +1,6 @@
-from typing import Any, Dict, Optional
 import numpy as np
 from src.data.containers import TimeSeriesContainer
 from src.synthetic_generation.abstract_classes import GeneratorWrapper
 from src.synthetic_generation.audio_generators.multi_scale_fractal_generator import (
@@ -15,7 +14,7 @@ class MultiScaleFractalAudioWrapper(GeneratorWrapper):
         super().__init__(params)
         self.params: MultiScaleFractalAudioParams = params
-    def _sample_parameters(self, batch_size: int) -> Dict[str, Any]:
         params = super()._sample_parameters(batch_size)
         params.update(
             {
@@ -35,8 +34,8 @@ class MultiScaleFractalAudioWrapper(GeneratorWrapper):
     def generate_batch(
         self,
         batch_size: int,
-        seed: Optional[int] = None,
-        params: Optional[Dict[str, Any]] = None,
     ) -> TimeSeriesContainer:
         if seed is not None:
             self._set_random_seeds(seed)

+from typing import Any
 import numpy as np
 from src.data.containers import TimeSeriesContainer
 from src.synthetic_generation.abstract_classes import GeneratorWrapper
 from src.synthetic_generation.audio_generators.multi_scale_fractal_generator import (
         super().__init__(params)
         self.params: MultiScaleFractalAudioParams = params
+    def _sample_parameters(self, batch_size: int) -> dict[str, Any]:
         params = super()._sample_parameters(batch_size)
         params.update(
             {
     def generate_batch(
         self,
         batch_size: int,
+        seed: int | None = None,
+        params: dict[str, Any] | None = None,
     ) -> TimeSeriesContainer:
         if seed is not None:
             self._set_random_seeds(seed)

src/synthetic_generation/audio_generators/network_topology_generator.py CHANGED Viewed

@@ -1,8 +1,5 @@
-from typing import Optional, Tuple
 import numpy as np
 from pyo import LFO, BrownNoise, Metro, Mix, Noise, TrigExpseg
 from src.synthetic_generation.abstract_classes import AbstractTimeSeriesGenerator
 from src.synthetic_generation.audio_generators.utils import (
     normalize_waveform,
@@ -33,11 +30,9 @@ class NetworkTopologyAudioGenerator(AbstractTimeSeriesGenerator):
         overhead_lfo_freq_range: tuple[float, float],
         overhead_mul_range: tuple[float, float],
         attack_period_range: tuple[float, float],
-        attack_env_points: Tuple[
-            Tuple[float, float], Tuple[float, float], Tuple[float, float]
-        ],
         attack_mul_range: tuple[float, float],
-        random_seed: Optional[int] = None,
     ):
         self.length = length
         self.server_duration = server_duration
@@ -98,7 +93,7 @@ class NetworkTopologyAudioGenerator(AbstractTimeSeriesGenerator):
         return Mix([traffic_base, bursts, congestion_env, overhead, attacks], voices=1)
-    def generate_time_series(self, random_seed: Optional[int] = None) -> np.ndarray:
         if random_seed is not None:
             self.rng = np.random.default_rng(random_seed)

 import numpy as np
 from pyo import LFO, BrownNoise, Metro, Mix, Noise, TrigExpseg
 from src.synthetic_generation.abstract_classes import AbstractTimeSeriesGenerator
 from src.synthetic_generation.audio_generators.utils import (
     normalize_waveform,
         overhead_lfo_freq_range: tuple[float, float],
         overhead_mul_range: tuple[float, float],
         attack_period_range: tuple[float, float],
+        attack_env_points: tuple[tuple[float, float], tuple[float, float], tuple[float, float]],
         attack_mul_range: tuple[float, float],
+        random_seed: int | None = None,
     ):
         self.length = length
         self.server_duration = server_duration
         return Mix([traffic_base, bursts, congestion_env, overhead, attacks], voices=1)
+    def generate_time_series(self, random_seed: int | None = None) -> np.ndarray:
         if random_seed is not None:
             self.rng = np.random.default_rng(random_seed)

src/synthetic_generation/audio_generators/network_topology_wrapper.py CHANGED Viewed

@@ -1,7 +1,6 @@
-from typing import Any, Dict, Optional
 import numpy as np
 from src.data.containers import TimeSeriesContainer
 from src.synthetic_generation.abstract_classes import GeneratorWrapper
 from src.synthetic_generation.audio_generators.network_topology_generator import (
@@ -15,7 +14,7 @@ class NetworkTopologyAudioWrapper(GeneratorWrapper):
         super().__init__(params)
         self.params: NetworkTopologyAudioParams = params
-    def _sample_parameters(self, batch_size: int) -> Dict[str, Any]:
         params = super()._sample_parameters(batch_size)
         params.update(
             {
@@ -43,8 +42,8 @@ class NetworkTopologyAudioWrapper(GeneratorWrapper):
     def generate_batch(
         self,
         batch_size: int,
-        seed: Optional[int] = None,
-        params: Optional[Dict[str, Any]] = None,
     ) -> TimeSeriesContainer:
         if seed is not None:
             self._set_random_seeds(seed)

+from typing import Any
 import numpy as np
 from src.data.containers import TimeSeriesContainer
 from src.synthetic_generation.abstract_classes import GeneratorWrapper
 from src.synthetic_generation.audio_generators.network_topology_generator import (
         super().__init__(params)
         self.params: NetworkTopologyAudioParams = params
+    def _sample_parameters(self, batch_size: int) -> dict[str, Any]:
         params = super()._sample_parameters(batch_size)
         params.update(
             {
     def generate_batch(
         self,
         batch_size: int,
+        seed: int | None = None,
+        params: dict[str, Any] | None = None,
     ) -> TimeSeriesContainer:
         if seed is not None:
             self._set_random_seeds(seed)

src/synthetic_generation/audio_generators/stochastic_rhythm_generator.py CHANGED Viewed

@@ -1,8 +1,5 @@
-from typing import Optional
 import numpy as np
 from pyo import Metro, Mix, Sine, TrigExpseg
 from src.synthetic_generation.abstract_classes import AbstractTimeSeriesGenerator
 from src.synthetic_generation.audio_generators.utils import (
     normalize_waveform,
@@ -29,7 +26,7 @@ class StochasticRhythmAudioGenerator(AbstractTimeSeriesGenerator):
         decay_range: tuple[float, float],
         tone_freq_range: tuple[float, float],
         tone_mul_range: tuple[float, float],
-        random_seed: Optional[int] = None,
     ):
         self.length = length
         self.server_duration = server_duration
@@ -48,15 +45,11 @@ class StochasticRhythmAudioGenerator(AbstractTimeSeriesGenerator):
     def _build_synth(self):
         base_tempo = self.rng.uniform(*self.base_tempo_hz_range)
-        num_layers = int(
-            self.rng.integers(self.num_layers_range[0], self.num_layers_range[1] + 1)
-        )
         layers = []
         for _ in range(num_layers):
-            subdivision = self.subdivisions[
-                int(self.rng.integers(0, len(self.subdivisions)))
-            ]
             rhythm_freq = base_tempo * subdivision
             trigger = Metro(time=1.0 / rhythm_freq).play()
@@ -71,7 +64,7 @@ class StochasticRhythmAudioGenerator(AbstractTimeSeriesGenerator):
         return Mix(layers, voices=1)
-    def generate_time_series(self, random_seed: Optional[int] = None) -> np.ndarray:
         if random_seed is not None:
             self.rng = np.random.default_rng(random_seed)

 import numpy as np
 from pyo import Metro, Mix, Sine, TrigExpseg
 from src.synthetic_generation.abstract_classes import AbstractTimeSeriesGenerator
 from src.synthetic_generation.audio_generators.utils import (
     normalize_waveform,
         decay_range: tuple[float, float],
         tone_freq_range: tuple[float, float],
         tone_mul_range: tuple[float, float],
+        random_seed: int | None = None,
     ):
         self.length = length
         self.server_duration = server_duration
     def _build_synth(self):
         base_tempo = self.rng.uniform(*self.base_tempo_hz_range)
+        num_layers = int(self.rng.integers(self.num_layers_range[0], self.num_layers_range[1] + 1))
         layers = []
         for _ in range(num_layers):
+            subdivision = self.subdivisions[int(self.rng.integers(0, len(self.subdivisions)))]
             rhythm_freq = base_tempo * subdivision
             trigger = Metro(time=1.0 / rhythm_freq).play()
         return Mix(layers, voices=1)
+    def generate_time_series(self, random_seed: int | None = None) -> np.ndarray:
         if random_seed is not None:
             self.rng = np.random.default_rng(random_seed)

src/synthetic_generation/audio_generators/stochastic_rhythm_wrapper.py CHANGED Viewed

@@ -1,7 +1,6 @@
-from typing import Any, Dict, Optional
 import numpy as np
 from src.data.containers import TimeSeriesContainer
 from src.synthetic_generation.abstract_classes import GeneratorWrapper
 from src.synthetic_generation.audio_generators.stochastic_rhythm_generator import (
@@ -15,7 +14,7 @@ class StochasticRhythmAudioWrapper(GeneratorWrapper):
         super().__init__(params)
         self.params: StochasticRhythmAudioParams = params
-    def _sample_parameters(self, batch_size: int) -> Dict[str, Any]:
         params = super()._sample_parameters(batch_size)
         params.update(
             {
@@ -37,8 +36,8 @@ class StochasticRhythmAudioWrapper(GeneratorWrapper):
     def generate_batch(
         self,
         batch_size: int,
-        seed: Optional[int] = None,
-        params: Optional[Dict[str, Any]] = None,
     ) -> TimeSeriesContainer:
         if seed is not None:
             self._set_random_seeds(seed)

+from typing import Any
 import numpy as np
 from src.data.containers import TimeSeriesContainer
 from src.synthetic_generation.abstract_classes import GeneratorWrapper
 from src.synthetic_generation.audio_generators.stochastic_rhythm_generator import (
         super().__init__(params)
         self.params: StochasticRhythmAudioParams = params
+    def _sample_parameters(self, batch_size: int) -> dict[str, Any]:
         params = super()._sample_parameters(batch_size)
         params.update(
             {
     def generate_batch(
         self,
         batch_size: int,
+        seed: int | None = None,
+        params: dict[str, Any] | None = None,
     ) -> TimeSeriesContainer:
         if seed is not None:
             self._set_random_seeds(seed)

src/synthetic_generation/audio_generators/utils.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import os
 import tempfile
 import time
 from contextlib import redirect_stderr, redirect_stdout
-from typing import Callable
 import numpy as np
 from pyo import NewTable, Server, TableRec

 import os
 import tempfile
 import time
+from collections.abc import Callable
 from contextlib import redirect_stderr, redirect_stdout
 import numpy as np
 from pyo import NewTable, Server, TableRec

src/synthetic_generation/augmentations/offline_per_sample_iid_augmentations.py CHANGED Viewed

@@ -3,14 +3,13 @@ import logging
 import sys
 import time
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pyarrow.feather as feather
 import torch
 from src.data.augmentations import (
     CensorAugmenter,
     DifferentialAugmenter,
@@ -81,17 +80,13 @@ class TimeSeriesDatasetManager:
                     last_batch_table = feather.read_table(last_batch_file)
                     if len(last_batch_table) < self.batch_size:
                         self.batch_counter = max_batch_num
-                        logging.info(
-                            f"Found incomplete last batch {max_batch_num} with {len(last_batch_table)} series"
-                        )
                 except Exception as e:
                     logging.warning(f"Error checking last batch: {e}")
-        logging.info(
-            f"Resuming from: batch_counter={self.batch_counter}, series_counter={self.series_counter}"
-        )
-    def append_batch(self, batch_data: List[Dict[str, Any]]) -> None:
         if not batch_data:
             return
@@ -101,11 +96,7 @@ class TimeSeriesDatasetManager:
                 field_name = field.name
                 if field_name in ["start", "generation_timestamp"]:
                     timestamps = [row[field_name] for row in batch_data]
-                    arrays.append(
-                        pa.array(
-                            [ts.value for ts in timestamps], type=pa.timestamp("ns")
-                        )
-                    )
                 else:
                     arrays.append(pa.array([row[field_name] for row in batch_data]))
@@ -125,8 +116,8 @@ class TimeSeriesDatasetManager:
 class UnivariateOfflineAugmentor:
     def __init__(
         self,
-        augmentations: Optional[Dict[str, bool]] = None,
-        augmentation_probabilities: Optional[Dict[str, float]] = None,
         global_seed: int = 42,
     ):
         self.global_seed = global_seed
@@ -145,9 +136,7 @@ class UnivariateOfflineAugmentor:
         self.yflip_augmenter = None
         if self.augmentations["yflip_augmentation"]:
-            self.yflip_augmenter = YFlipAugmenter(
-                p_flip=self.augmentation_probabilities["yflip_augmentation"]
-            )
         self.censor_augmenter = None
         if self.augmentations["censor_augmentation"]:
@@ -156,9 +145,7 @@ class UnivariateOfflineAugmentor:
         self.quantization_augmenter = None
         if self.augmentations["quantization_augmentation"]:
             self.quantization_augmenter = QuantizationAugmenter(
-                p_quantize=self.augmentation_probabilities[
-                    "censor_or_quantization_augmentation"
-                ],
                 level_range=(5, 15),
             )
@@ -170,8 +157,8 @@ class UnivariateOfflineAugmentor:
     def apply(
         self,
         history_values: torch.Tensor,
-        starts: Optional[List[pd.Timestamp]] = None,
-        frequencies: Optional[List[str]] = None,
     ) -> torch.Tensor:
         if not self.apply_augmentations:
             return history_values
@@ -179,10 +166,7 @@ class UnivariateOfflineAugmentor:
         batch_size = int(history_values.shape[0])
         # 0) Combination (MixUp) – handled early at batch level due to dependency on other series
-        if (
-            self.augmentations.get("mixup_augmentation", False)
-            and self.mixup_augmenter is not None
-        ):
             history_values = self.mixup_augmenter.transform(history_values)
         # Per-series plan: sample categories and apply in fixed order per series
@@ -245,9 +229,7 @@ class UnivariateOfflineAugmentor:
             num_ops = min(num_ops, len(candidates))
             probs = np.array([weights[c] for c in candidates], dtype=float)
             probs = probs / probs.sum()
-            chosen_categories = list(
-                self.rng.choice(candidates, size=num_ops, replace=False, p=probs)
-            )
             # Apply in the fixed global order, only if selected
             # 1) Invariances
@@ -291,23 +273,15 @@ class UnivariateOfflineAugmentor:
                     if pick == "calendar":
                         series = self._apply_calendar_injections(
                             series,
-                            [starts[b]]
-                            if (starts is not None and b < len(starts))
-                            else None,
-                            [frequencies[b]]
-                            if (frequencies is not None and b < len(frequencies))
-                            else None,
                             p_apply=1.0,
                         )
                     else:
-                        series = self._apply_seasonality_amplitude_modulation(
-                            series, p_apply=1.0
-                        )
             # 4) Sampling artifacts
-            if "artifacts" in chosen_categories and self.augmentations.get(
-                "resample_artifacts_augmentation", False
-            ):
                 series = self._apply_resample_artifacts(series, p_apply=1.0)
             # 5) Analytic transforms
@@ -324,10 +298,7 @@ class UnivariateOfflineAugmentor:
                     self.augmentations.get("quantization_augmentation", False)
                     and self.quantization_augmenter is not None
                 )
-                can_cens = (
-                    self.augmentations.get("censor_augmentation", False)
-                    and self.censor_augmenter is not None
-                )
                 if can_quant and can_cens:
                     method = self.rng.choice(["quantize", "censor"], p=[0.6, 0.4])
                     if method == "quantize":
@@ -344,16 +315,12 @@ class UnivariateOfflineAugmentor:
         # 7) Scaling then Noise (last, optional, batch-level)
         if self.augmentations.get("scaling_augmentation", False):
-            if self.rng.random() < self.augmentation_probabilities.get(
-                "scaling_augmentation", 0.0
-            ):
                 scale_factor = float(self.rng.uniform(0.95, 1.05))
                 history_values = history_values * scale_factor
         if self.augmentations.get("noise_augmentation", False):
-            if self.rng.random() < self.augmentation_probabilities.get(
-                "noise_augmentation", 0.0
-            ):
                 noise_std = 0.01 * torch.std(history_values)
                 if torch.isfinite(noise_std) and (noise_std > 0):
                     noise = torch.normal(0, noise_std, size=history_values.shape)
@@ -364,8 +331,8 @@ class UnivariateOfflineAugmentor:
     def apply_per_series_only(
         self,
         series: torch.Tensor,
-        start: Optional[pd.Timestamp] = None,
-        frequency: Optional[str] = None,
     ) -> torch.Tensor:
         """
         Apply all per-series augmentations (excluding mixup) to a single series tensor,
@@ -429,9 +396,7 @@ class UnivariateOfflineAugmentor:
             num_ops = min(num_ops, len(candidates))
             probs = np.array([weights[c] for c in candidates], dtype=float)
             probs = probs / probs.sum()
-            chosen_categories = list(
-                self.rng.choice(candidates, size=num_ops, replace=False, p=probs)
-            )
             result = series.clone()
@@ -480,14 +445,10 @@ class UnivariateOfflineAugmentor:
                             p_apply=1.0,
                         )
                     else:
-                        result = self._apply_seasonality_amplitude_modulation(
-                            result, p_apply=1.0
-                        )
             # 4) Sampling artifacts
-            if "artifacts" in chosen_categories and self.augmentations.get(
-                "resample_artifacts_augmentation", False
-            ):
                 result = self._apply_resample_artifacts(result, p_apply=1.0)
             # 5) Analytic transforms
@@ -504,10 +465,7 @@ class UnivariateOfflineAugmentor:
                     self.augmentations.get("quantization_augmentation", False)
                     and self.quantization_augmenter is not None
                 )
-                can_cens = (
-                    self.augmentations.get("censor_augmentation", False)
-                    and self.censor_augmenter is not None
-                )
                 if can_quant and can_cens:
                     method = self.rng.choice(["quantize", "censor"], p=[0.6, 0.4])
                     if method == "quantize":
@@ -521,16 +479,12 @@ class UnivariateOfflineAugmentor:
         # Optional scaling and noise (applied to this single series)
         if self.augmentations.get("scaling_augmentation", False):
-            if self.rng.random() < self.augmentation_probabilities.get(
-                "scaling_augmentation", 0.0
-            ):
                 scale_factor = float(self.rng.uniform(0.95, 1.05))
                 result = result * scale_factor
         if self.augmentations.get("noise_augmentation", False):
-            if self.rng.random() < self.augmentation_probabilities.get(
-                "noise_augmentation", 0.0
-            ):
                 noise_std = 0.01 * torch.std(result)
                 if torch.isfinite(noise_std) and (noise_std > 0):
                     noise = torch.normal(0, noise_std, size=result.shape)
@@ -539,20 +493,16 @@ class UnivariateOfflineAugmentor:
         return result
     @property
-    def mixup_augmenter(self) -> Optional[MixUpAugmenter]:
         if not hasattr(self, "_mixup_augmenter"):
             self._mixup_augmenter = (
-                MixUpAugmenter(
-                    p_combine=self.augmentation_probabilities["mixup_augmentation"]
-                )
                 if self.augmentations["mixup_augmentation"]
                 else None
             )
         return self._mixup_augmenter
-    def _apply_regime_change(
-        self, series: torch.Tensor, p_apply: float
-    ) -> torch.Tensor:
         """
         Apply piecewise affine transforms with 1-3 change-points per series.
         series shape: [batch, length, 1]
@@ -601,15 +551,11 @@ class UnivariateOfflineAugmentor:
                 segment = series_b[s:e]
                 # preserve segment mean roughly while scaling deviations
                 seg_mean = torch.mean(segment)
-                transformed = (
-                    (segment - seg_mean) * seg_scales[i] + seg_mean + seg_shifts[i]
-                )
                 result[b, s:e, 0] = transformed
         return result
-    def _apply_shock_recovery(
-        self, series: torch.Tensor, p_apply: float
-    ) -> torch.Tensor:
         """
         Add an impulse at a random time and exponentially decay to baseline.
         series shape: [batch, length, 1]
@@ -626,11 +572,7 @@ class UnivariateOfflineAugmentor:
             if self.rng.random() >= p_apply:
                 continue
             # choose shock time away from edges
-            t0 = int(
-                self.rng.integers(
-                    low=max(1, length // 16), high=max(2, length - length // 16)
-                )
-            )
             # magnitude relative to series std
             s_b = result[b, :, 0]
             std_b = torch.std(s_b).item()
@@ -649,8 +591,8 @@ class UnivariateOfflineAugmentor:
     def _apply_calendar_injections(
         self,
         series: torch.Tensor,
-        starts: Optional[List[pd.Timestamp]],
-        frequencies: Optional[List[str]],
         p_apply: float,
     ) -> torch.Tensor:
         if series.numel() == 0:
@@ -719,9 +661,7 @@ class UnivariateOfflineAugmentor:
             result[b, :, 0] = torch.from_numpy(s_new).to(result.device)
         return result
-    def _apply_seasonality_amplitude_modulation(
-        self, series: torch.Tensor, p_apply: float
-    ) -> torch.Tensor:
         if series.numel() == 0:
             return series
         batch_size, length, _ = series.shape
@@ -771,9 +711,7 @@ class UnivariateOfflineAugmentor:
                 continue
             ds_vals = s_np[ds_idx]
             base_idx = np.arange(length)
-            mode = self.rng.choice(
-                ["linear", "hold", "linear_smooth"], p=[0.5, 0.2, 0.3]
-            )
             if mode == "linear":
                 us = np.interp(base_idx, ds_idx, ds_vals)
             elif mode == "hold":
@@ -799,11 +737,11 @@ class OfflinePerSampleAugmentedGenerator:
         self,
         base_data_dir: str,
         output_dir: str,
-        length: Optional[int],
         chunk_size: int = 2**13,
-        generator_proportions: Optional[Dict[str, float]] = None,
-        augmentations: Optional[Dict[str, bool]] = None,
-        augmentation_probabilities: Optional[Dict[str, float]] = None,
         global_seed: int = 42,
         mixup_position: str = "both",
         change_threshold: float = 0.05,
@@ -824,14 +762,8 @@ class OfflinePerSampleAugmentedGenerator:
         self.enable_quality_filter = bool(enable_quality_filter)
         self.rc_batch_size = int(rc_batch_size)
-        out_dir_name = (
-            f"augmented_per_sample_{length}"
-            if length is not None
-            else "augmented_per_sample"
-        )
-        self.dataset_manager = TimeSeriesDatasetManager(
-            str(Path(output_dir) / out_dir_name), batch_size=chunk_size
-        )
         self.augmentor = UnivariateOfflineAugmentor(
             augmentations=augmentations,
@@ -843,7 +775,7 @@ class OfflinePerSampleAugmentedGenerator:
         self.datasets = self._initialize_datasets()
     # -------------------- Per-sample scaler utilities --------------------
-    def _choose_scaler(self) -> Optional[object]:
         """Choose a scaler with 50% probability of None; else one of four scalers uniformly."""
         if self.rng.random() < 0.5:
             return None
@@ -856,9 +788,7 @@ class OfflinePerSampleAugmentedGenerator:
             return MedianScaler()
         return MeanScaler()
-    def _apply_scaler(
-        self, values: torch.Tensor, scaler: Optional[object]
-    ) -> torch.Tensor:
         """Apply the provided scaler to values of shape [1, length, channels]."""
         if scaler is None:
             return values
@@ -866,9 +796,7 @@ class OfflinePerSampleAugmentedGenerator:
         return scaler.scale(values, stats)
     # -------------------- Mixup utilities (per-sample) --------------------
-    def _mix_sources_static(
-        self, source_tensor: torch.Tensor, alpha: float
-    ) -> torch.Tensor:
         """Static Dirichlet mix of k sources -> [1, L, C]."""
         k = int(source_tensor.shape[0])
         device = source_tensor.device
@@ -881,7 +809,7 @@ class OfflinePerSampleAugmentedGenerator:
         self,
         base_series: torch.Tensor,
         total_length_for_batch: int,
-        scaler: Optional[object],
     ) -> torch.Tensor:
         """Mix base with k-1 additional sources; returns [1, L, 1]."""
         mixup = self.augmentor.mixup_augmenter
@@ -889,11 +817,7 @@ class OfflinePerSampleAugmentedGenerator:
             return base_series
         # Decide k
-        current_k = (
-            mixup._sample_k()
-            if not mixup.randomize_k
-            else int(self.rng.integers(2, mixup.max_k + 1))
-        )
         # Ensure at least 2 and include base in the set
         current_k = max(2, int(current_k))
         num_sources_needed = current_k - 1
@@ -902,14 +826,12 @@ class OfflinePerSampleAugmentedGenerator:
         # If we sampled k gens but need only k-1 external sources, trim
         chosen_gens = chosen_gens[:num_sources_needed]
-        sources: List[torch.Tensor] = []
         # Base (already possibly scaled) first
         sources.append(base_series)
         # Additional sources
         for gen in chosen_gens:
-            src_values, _, _, _ = self._get_one_sample_from_generator(
-                gen, total_length_for_batch
-            )
             if scaler is not None:
                 src_values = self._apply_scaler(src_values, scaler)
             sources.append(src_values)
@@ -924,27 +846,23 @@ class OfflinePerSampleAugmentedGenerator:
         self,
         base_series: torch.Tensor,
         total_length_for_batch: int,
-        scaler: Optional[object],
     ) -> torch.Tensor:
         """Apply RandomConvAugmenter by creating a small temp batch and taking the transformed base element."""
         if not hasattr(self, "random_conv_augmenter"):
             # Lazy init if not present but enabled in config
             if self.augmentor.augmentations.get("random_conv_augmentation", False):
-                p_val = self.augmentor.augmentation_probabilities.get(
-                    "random_conv_augmentation", 0.3
-                )
                 self.random_conv_augmenter = RandomConvAugmenter(p_transform=p_val)
             else:
                 return base_series
         # Assemble temp batch: base + (rc_batch_size-1) sources
-        temp_series_list: List[torch.Tensor] = [base_series]
         for _ in range(max(0, self.rc_batch_size - 1)):
             try:
                 gen = self._sample_generator_name()
-                src_values, _, _, _ = self._get_one_sample_from_generator(
-                    gen, total_length_for_batch
-                )
                 if scaler is not None:
                     src_values = self._apply_scaler(src_values, scaler)
                 temp_series_list.append(src_values)
@@ -956,9 +874,7 @@ class OfflinePerSampleAugmentedGenerator:
         return transformed[0:1]
     # -------------------- Selection and quality helpers --------------------
-    def _compute_change_score(
-        self, original: torch.Tensor, augmented: torch.Tensor
-    ) -> float:
         """
         Computes a normalized change score between original and augmented series.
         The score is the Mean Absolute Error (MAE) normalized by a robust
@@ -983,15 +899,13 @@ class OfflinePerSampleAugmentedGenerator:
     # moved to src/synthetic_generation/augmentations/filter.py
-    def _setup_proportions(
-        self, generator_proportions: Optional[Dict[str, float]]
-    ) -> Dict[str, float]:
         # Default uniform proportions across discovered generators
         if generator_proportions is None:
             # Discover generator directories
             base = Path(self.base_data_dir)
             discovered = [p.name for p in base.iterdir() if p.is_dir()]
-            proportions = {name: 1.0 for name in discovered}
         else:
             proportions = dict(generator_proportions)
@@ -1000,17 +914,15 @@ class OfflinePerSampleAugmentedGenerator:
             raise ValueError("Total generator proportions must be positive")
         return {k: v / total for k, v in proportions.items()}
-    def _initialize_datasets(self) -> Dict[str, CyclicalBatchDataset]:
-        datasets: Dict[str, CyclicalBatchDataset] = {}
         for generator_name, proportion in self.generator_proportions.items():
             # Load batches only if the generator is explicitly listed and has positive proportion
             if proportion <= 0:
                 continue
             batches_dir = Path(self.base_data_dir) / generator_name
             if not batches_dir.is_dir():
-                logging.warning(
-                    f"Skipping '{generator_name}' because directory does not exist: {batches_dir}"
-                )
                 continue
             try:
                 dataset = CyclicalBatchDataset(
@@ -1028,9 +940,7 @@ class OfflinePerSampleAugmentedGenerator:
             raise ValueError("No valid datasets loaded from base_data_dir")
         return datasets
-    def _convert_sample_to_tensor(
-        self, sample: dict
-    ) -> Tuple[torch.Tensor, Any, str, int]:
         num_channels = sample.get("num_channels", 1)
         values_data = sample["values"]
@@ -1070,43 +980,33 @@ class OfflinePerSampleAugmentedGenerator:
     def _sample_generator_name(self) -> str:
         available = [g for g in self.generator_proportions.keys() if g in self.datasets]
-        probs = np.array(
-            [self.generator_proportions[g] for g in available], dtype=float
-        )
         probs = probs / probs.sum()
         return str(np.random.choice(available, p=probs))
-    def _get_one_sample(
-        self, total_length_for_batch: int
-    ) -> Tuple[torch.Tensor, pd.Timestamp, str, int]:
         attempts = 0
         while attempts < 20:
             attempts += 1
             gen_name = self._sample_generator_name()
             dataset = self.datasets[gen_name]
             sample = dataset.get_samples(1)[0]
-            values, start, freq_str, num_channels = self._convert_sample_to_tensor(
-                sample
-            )
             values = self._maybe_resize(values, total_length_for_batch)
             if values.shape[2] != 1:
                 continue
             return values, start, freq_str, num_channels
-        raise RuntimeError(
-            "Failed to sample a valid univariate series after multiple attempts"
-        )
     def _get_one_sample_from_generator(
         self, gen_name: str, total_length_for_batch: int
-    ) -> Tuple[torch.Tensor, pd.Timestamp, str, int]:
         attempts = 0
         dataset = self.datasets[gen_name]
         while attempts < 20:
             attempts += 1
             sample = dataset.get_samples(1)[0]
-            values, start, freq_str, num_channels = self._convert_sample_to_tensor(
-                sample
-            )
             values = self._maybe_resize(values, total_length_for_batch)
             if values.shape[2] != 1:
                 continue
@@ -1115,18 +1015,16 @@ class OfflinePerSampleAugmentedGenerator:
             f"Failed to sample a valid univariate series from generator '{gen_name}' after multiple attempts"
         )
-    def _choose_generators_for_mixup(self, k: int) -> List[str]:
         available = [g for g in self.generator_proportions.keys() if g in self.datasets]
         if not available:
             raise RuntimeError("No available generators to sample from for mixup")
         k_eff = min(k, len(available))
         # Weighted sampling without replacement by sequential renormalization
-        chosen: List[str] = []
         remaining = available.copy()
         while len(chosen) < k_eff:
-            weights = np.array(
-                [self.generator_proportions[g] for g in remaining], dtype=float
-            )
             if weights.sum() <= 0:
                 # fallback to uniform
                 probs = np.ones(len(remaining)) / len(remaining)
@@ -1137,14 +1035,10 @@ class OfflinePerSampleAugmentedGenerator:
             remaining.remove(pick)
         return chosen
-    def _maybe_apply_mixup_to_single(
-        self, base_series: torch.Tensor, total_length_for_batch: int
-    ) -> torch.Tensor:
-        do_mixup = (
-            self.augmentor.augmentations.get("mixup_augmentation", False)
-            and self.augmentor.rng.random()
-            < self.augmentor.augmentation_probabilities.get("mixup_augmentation", 0.0)
-        )
         if not do_mixup:
             return base_series
@@ -1154,21 +1048,15 @@ class OfflinePerSampleAugmentedGenerator:
             return base_series
         # Decide number of sources k consistent with MixUpAugmenter behavior
-        current_k = (
-            mixup._sample_k()
-            if not mixup.randomize_k
-            else int(self.augmentor.rng.integers(2, mixup.max_k + 1))
-        )
         # Choose distinct generators for sources according to proportions
         chosen_gens = self._choose_generators_for_mixup(current_k)
         # Collect one source per chosen generator
-        sources: List[torch.Tensor] = []
         for gen in chosen_gens:
-            src_values, _, _, _ = self._get_one_sample_from_generator(
-                gen, total_length_for_batch
-            )
             sources.append(src_values)
         source_tensor = torch.cat(sources, dim=0)
@@ -1177,15 +1065,13 @@ class OfflinePerSampleAugmentedGenerator:
         mixed_series = mixup.mix_sources(source_tensor, alpha=alpha)
         return mixed_series
-    def _tensor_to_values_list(
-        self, series_tensor: torch.Tensor
-    ) -> Tuple[List[List[float]], int, int]:
         # series_tensor shape: [1, seq_len, num_channels]
         seq_len = int(series_tensor.shape[1])
         num_channels = int(series_tensor.shape[2])
         if num_channels == 1:
             return [series_tensor.squeeze(0).squeeze(-1).tolist()], seq_len, 1
-        channels: List[List[float]] = []
         for ch in range(num_channels):
             channels.append(series_tensor[0, :, ch].tolist())
         return channels, seq_len, num_channels
@@ -1195,7 +1081,7 @@ class OfflinePerSampleAugmentedGenerator:
             f"Starting offline augmentation into {self.dataset_manager.batches_dir} | chunk_size={self.chunk_size}"
         )
-        augmented_buffer: List[Dict[str, Any]] = []
         target_batches = num_batches
         start_time = time.time()
@@ -1203,16 +1089,12 @@ class OfflinePerSampleAugmentedGenerator:
             while self.dataset_manager.batch_counter < target_batches:
                 # Decide target length for this sample
                 total_length_for_batch = (
-                    self.length
-                    if self.length is not None
-                    else int(np.random.choice(LENGTH_CHOICES))
                 )
                 for _ in range(max(1, self.max_tries)):
                     # Sample one base series
-                    base_values, base_start, base_freq, _ = self._get_one_sample(
-                        total_length_for_batch
-                    )
                     original_base = base_values.clone()
                     # Per-sample scaler choice (50% none; else robust/minmax/median/mean)
@@ -1224,9 +1106,7 @@ class OfflinePerSampleAugmentedGenerator:
                         self.augmentor.augmentations.get("mixup_augmentation", False)
                         and self.mixup_position in ["first", "both"]
                         and self.augmentor.rng.random()
-                        < self.augmentor.augmentation_probabilities.get(
-                            "mixup_augmentation", 0.0
-                        )
                     )
                     if do_mixup_early:
                         base_values = self._apply_mixup_to_series(
@@ -1239,14 +1119,9 @@ class OfflinePerSampleAugmentedGenerator:
                     )
                     # Optional analytic: RandomConvAugmenter via temp batch (before late mixup)
-                    if self.augmentor.augmentations.get(
-                        "random_conv_augmentation", False
-                    ):
-                        if (
-                            self.rng.random()
-                            < self.augmentor.augmentation_probabilities.get(
-                                "random_conv_augmentation", 0.3
-                            )
                         ):
                             augmented_single = self._apply_random_conv_with_temp_batch(
                                 augmented_single,
@@ -1259,9 +1134,7 @@ class OfflinePerSampleAugmentedGenerator:
                         self.augmentor.augmentations.get("mixup_augmentation", False)
                         and self.mixup_position in ["last", "both"]
                         and self.augmentor.rng.random()
-                        < self.augmentor.augmentation_probabilities.get(
-                            "mixup_augmentation", 0.0
-                        )
                     )
                     if do_mixup_late:
                         augmented_single = self._apply_mixup_to_series(
@@ -1278,9 +1151,7 @@ class OfflinePerSampleAugmentedGenerator:
                         continue
                     # Accept first candidate that passes thresholds
-                    values_list, seq_len, num_channels = self._tensor_to_values_list(
-                        augmented_single
-                    )
                     record = {
                         "series_id": self.dataset_manager.series_counter,
                         "values": values_list,
@@ -1300,19 +1171,19 @@ class OfflinePerSampleAugmentedGenerator:
                     self.dataset_manager.append_batch(augmented_buffer)
                     write_time = time.time() - write_start
                     elapsed = time.time() - start_time
-                    series_per_sec = (
-                        self.dataset_manager.series_counter / elapsed
-                        if elapsed > 0
-                        else 0
-                    )
                     print(
-                        f"✓ Wrote batch {self.dataset_manager.batch_counter - 1}/{target_batches} | Series: {self.dataset_manager.series_counter:,} | Rate: {series_per_sec:.1f}/s | Write: {write_time:.2f}s"
                     )
                     augmented_buffer = []
         except KeyboardInterrupt:
             logging.info(
-                f"Interrupted. Generated {self.dataset_manager.series_counter} series, {self.dataset_manager.batch_counter} batches."
             )
         finally:
             # Flush remaining buffer if any
@@ -1398,9 +1269,7 @@ def main():
         help="Temporary batch size used for RandomConvAugmenter",
     )
     parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
-    parser.add_argument(
-        "--global-seed", type=int, default=42, help="Global random seed"
-    )
     args = parser.parse_args()
     setup_logging(args.verbose)

 import sys
 import time
 from pathlib import Path
+from typing import Any
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pyarrow.feather as feather
 import torch
 from src.data.augmentations import (
     CensorAugmenter,
     DifferentialAugmenter,
                     last_batch_table = feather.read_table(last_batch_file)
                     if len(last_batch_table) < self.batch_size:
                         self.batch_counter = max_batch_num
+                        logging.info(f"Found incomplete last batch {max_batch_num} with {len(last_batch_table)} series")
                 except Exception as e:
                     logging.warning(f"Error checking last batch: {e}")
+        logging.info(f"Resuming from: batch_counter={self.batch_counter}, series_counter={self.series_counter}")
+    def append_batch(self, batch_data: list[dict[str, Any]]) -> None:
         if not batch_data:
             return
                 field_name = field.name
                 if field_name in ["start", "generation_timestamp"]:
                     timestamps = [row[field_name] for row in batch_data]
+                    arrays.append(pa.array([ts.value for ts in timestamps], type=pa.timestamp("ns")))
                 else:
                     arrays.append(pa.array([row[field_name] for row in batch_data]))
 class UnivariateOfflineAugmentor:
     def __init__(
         self,
+        augmentations: dict[str, bool] | None = None,
+        augmentation_probabilities: dict[str, float] | None = None,
         global_seed: int = 42,
     ):
         self.global_seed = global_seed
         self.yflip_augmenter = None
         if self.augmentations["yflip_augmentation"]:
+            self.yflip_augmenter = YFlipAugmenter(p_flip=self.augmentation_probabilities["yflip_augmentation"])
         self.censor_augmenter = None
         if self.augmentations["censor_augmentation"]:
         self.quantization_augmenter = None
         if self.augmentations["quantization_augmentation"]:
             self.quantization_augmenter = QuantizationAugmenter(
+                p_quantize=self.augmentation_probabilities["censor_or_quantization_augmentation"],
                 level_range=(5, 15),
             )
     def apply(
         self,
         history_values: torch.Tensor,
+        starts: list[pd.Timestamp] | None = None,
+        frequencies: list[str] | None = None,
     ) -> torch.Tensor:
         if not self.apply_augmentations:
             return history_values
         batch_size = int(history_values.shape[0])
         # 0) Combination (MixUp) – handled early at batch level due to dependency on other series
+        if self.augmentations.get("mixup_augmentation", False) and self.mixup_augmenter is not None:
             history_values = self.mixup_augmenter.transform(history_values)
         # Per-series plan: sample categories and apply in fixed order per series
             num_ops = min(num_ops, len(candidates))
             probs = np.array([weights[c] for c in candidates], dtype=float)
             probs = probs / probs.sum()
+            chosen_categories = list(self.rng.choice(candidates, size=num_ops, replace=False, p=probs))
             # Apply in the fixed global order, only if selected
             # 1) Invariances
                     if pick == "calendar":
                         series = self._apply_calendar_injections(
                             series,
+                            [starts[b]] if (starts is not None and b < len(starts)) else None,
+                            [frequencies[b]] if (frequencies is not None and b < len(frequencies)) else None,
                             p_apply=1.0,
                         )
                     else:
+                        series = self._apply_seasonality_amplitude_modulation(series, p_apply=1.0)
             # 4) Sampling artifacts
+            if "artifacts" in chosen_categories and self.augmentations.get("resample_artifacts_augmentation", False):
                 series = self._apply_resample_artifacts(series, p_apply=1.0)
             # 5) Analytic transforms
                     self.augmentations.get("quantization_augmentation", False)
                     and self.quantization_augmenter is not None
                 )
+                can_cens = self.augmentations.get("censor_augmentation", False) and self.censor_augmenter is not None
                 if can_quant and can_cens:
                     method = self.rng.choice(["quantize", "censor"], p=[0.6, 0.4])
                     if method == "quantize":
         # 7) Scaling then Noise (last, optional, batch-level)
         if self.augmentations.get("scaling_augmentation", False):
+            if self.rng.random() < self.augmentation_probabilities.get("scaling_augmentation", 0.0):
                 scale_factor = float(self.rng.uniform(0.95, 1.05))
                 history_values = history_values * scale_factor
         if self.augmentations.get("noise_augmentation", False):
+            if self.rng.random() < self.augmentation_probabilities.get("noise_augmentation", 0.0):
                 noise_std = 0.01 * torch.std(history_values)
                 if torch.isfinite(noise_std) and (noise_std > 0):
                     noise = torch.normal(0, noise_std, size=history_values.shape)
     def apply_per_series_only(
         self,
         series: torch.Tensor,
+        start: pd.Timestamp | None = None,
+        frequency: str | None = None,
     ) -> torch.Tensor:
         """
         Apply all per-series augmentations (excluding mixup) to a single series tensor,
             num_ops = min(num_ops, len(candidates))
             probs = np.array([weights[c] for c in candidates], dtype=float)
             probs = probs / probs.sum()
+            chosen_categories = list(self.rng.choice(candidates, size=num_ops, replace=False, p=probs))
             result = series.clone()
                             p_apply=1.0,
                         )
                     else:
+                        result = self._apply_seasonality_amplitude_modulation(result, p_apply=1.0)
             # 4) Sampling artifacts
+            if "artifacts" in chosen_categories and self.augmentations.get("resample_artifacts_augmentation", False):
                 result = self._apply_resample_artifacts(result, p_apply=1.0)
             # 5) Analytic transforms
                     self.augmentations.get("quantization_augmentation", False)
                     and self.quantization_augmenter is not None
                 )
+                can_cens = self.augmentations.get("censor_augmentation", False) and self.censor_augmenter is not None
                 if can_quant and can_cens:
                     method = self.rng.choice(["quantize", "censor"], p=[0.6, 0.4])
                     if method == "quantize":
         # Optional scaling and noise (applied to this single series)
         if self.augmentations.get("scaling_augmentation", False):
+            if self.rng.random() < self.augmentation_probabilities.get("scaling_augmentation", 0.0):
                 scale_factor = float(self.rng.uniform(0.95, 1.05))
                 result = result * scale_factor
         if self.augmentations.get("noise_augmentation", False):
+            if self.rng.random() < self.augmentation_probabilities.get("noise_augmentation", 0.0):
                 noise_std = 0.01 * torch.std(result)
                 if torch.isfinite(noise_std) and (noise_std > 0):
                     noise = torch.normal(0, noise_std, size=result.shape)
         return result
     @property
+    def mixup_augmenter(self) -> MixUpAugmenter | None:
         if not hasattr(self, "_mixup_augmenter"):
             self._mixup_augmenter = (
+                MixUpAugmenter(p_combine=self.augmentation_probabilities["mixup_augmentation"])
                 if self.augmentations["mixup_augmentation"]
                 else None
             )
         return self._mixup_augmenter
+    def _apply_regime_change(self, series: torch.Tensor, p_apply: float) -> torch.Tensor:
         """
         Apply piecewise affine transforms with 1-3 change-points per series.
         series shape: [batch, length, 1]
                 segment = series_b[s:e]
                 # preserve segment mean roughly while scaling deviations
                 seg_mean = torch.mean(segment)
+                transformed = (segment - seg_mean) * seg_scales[i] + seg_mean + seg_shifts[i]
                 result[b, s:e, 0] = transformed
         return result
+    def _apply_shock_recovery(self, series: torch.Tensor, p_apply: float) -> torch.Tensor:
         """
         Add an impulse at a random time and exponentially decay to baseline.
         series shape: [batch, length, 1]
             if self.rng.random() >= p_apply:
                 continue
             # choose shock time away from edges
+            t0 = int(self.rng.integers(low=max(1, length // 16), high=max(2, length - length // 16)))
             # magnitude relative to series std
             s_b = result[b, :, 0]
             std_b = torch.std(s_b).item()
     def _apply_calendar_injections(
         self,
         series: torch.Tensor,
+        starts: list[pd.Timestamp] | None,
+        frequencies: list[str] | None,
         p_apply: float,
     ) -> torch.Tensor:
         if series.numel() == 0:
             result[b, :, 0] = torch.from_numpy(s_new).to(result.device)
         return result
+    def _apply_seasonality_amplitude_modulation(self, series: torch.Tensor, p_apply: float) -> torch.Tensor:
         if series.numel() == 0:
             return series
         batch_size, length, _ = series.shape
                 continue
             ds_vals = s_np[ds_idx]
             base_idx = np.arange(length)
+            mode = self.rng.choice(["linear", "hold", "linear_smooth"], p=[0.5, 0.2, 0.3])
             if mode == "linear":
                 us = np.interp(base_idx, ds_idx, ds_vals)
             elif mode == "hold":
         self,
         base_data_dir: str,
         output_dir: str,
+        length: int | None,
         chunk_size: int = 2**13,
+        generator_proportions: dict[str, float] | None = None,
+        augmentations: dict[str, bool] | None = None,
+        augmentation_probabilities: dict[str, float] | None = None,
         global_seed: int = 42,
         mixup_position: str = "both",
         change_threshold: float = 0.05,
         self.enable_quality_filter = bool(enable_quality_filter)
         self.rc_batch_size = int(rc_batch_size)
+        out_dir_name = f"augmented_per_sample_{length}" if length is not None else "augmented_per_sample"
+        self.dataset_manager = TimeSeriesDatasetManager(str(Path(output_dir) / out_dir_name), batch_size=chunk_size)
         self.augmentor = UnivariateOfflineAugmentor(
             augmentations=augmentations,
         self.datasets = self._initialize_datasets()
     # -------------------- Per-sample scaler utilities --------------------
+    def _choose_scaler(self) -> object | None:
         """Choose a scaler with 50% probability of None; else one of four scalers uniformly."""
         if self.rng.random() < 0.5:
             return None
             return MedianScaler()
         return MeanScaler()
+    def _apply_scaler(self, values: torch.Tensor, scaler: object | None) -> torch.Tensor:
         """Apply the provided scaler to values of shape [1, length, channels]."""
         if scaler is None:
             return values
         return scaler.scale(values, stats)
     # -------------------- Mixup utilities (per-sample) --------------------
+    def _mix_sources_static(self, source_tensor: torch.Tensor, alpha: float) -> torch.Tensor:
         """Static Dirichlet mix of k sources -> [1, L, C]."""
         k = int(source_tensor.shape[0])
         device = source_tensor.device
         self,
         base_series: torch.Tensor,
         total_length_for_batch: int,
+        scaler: object | None,
     ) -> torch.Tensor:
         """Mix base with k-1 additional sources; returns [1, L, 1]."""
         mixup = self.augmentor.mixup_augmenter
             return base_series
         # Decide k
+        current_k = mixup._sample_k() if not mixup.randomize_k else int(self.rng.integers(2, mixup.max_k + 1))
         # Ensure at least 2 and include base in the set
         current_k = max(2, int(current_k))
         num_sources_needed = current_k - 1
         # If we sampled k gens but need only k-1 external sources, trim
         chosen_gens = chosen_gens[:num_sources_needed]
+        sources: list[torch.Tensor] = []
         # Base (already possibly scaled) first
         sources.append(base_series)
         # Additional sources
         for gen in chosen_gens:
+            src_values, _, _, _ = self._get_one_sample_from_generator(gen, total_length_for_batch)
             if scaler is not None:
                 src_values = self._apply_scaler(src_values, scaler)
             sources.append(src_values)
         self,
         base_series: torch.Tensor,
         total_length_for_batch: int,
+        scaler: object | None,
     ) -> torch.Tensor:
         """Apply RandomConvAugmenter by creating a small temp batch and taking the transformed base element."""
         if not hasattr(self, "random_conv_augmenter"):
             # Lazy init if not present but enabled in config
             if self.augmentor.augmentations.get("random_conv_augmentation", False):
+                p_val = self.augmentor.augmentation_probabilities.get("random_conv_augmentation", 0.3)
                 self.random_conv_augmenter = RandomConvAugmenter(p_transform=p_val)
             else:
                 return base_series
         # Assemble temp batch: base + (rc_batch_size-1) sources
+        temp_series_list: list[torch.Tensor] = [base_series]
         for _ in range(max(0, self.rc_batch_size - 1)):
             try:
                 gen = self._sample_generator_name()
+                src_values, _, _, _ = self._get_one_sample_from_generator(gen, total_length_for_batch)
                 if scaler is not None:
                     src_values = self._apply_scaler(src_values, scaler)
                 temp_series_list.append(src_values)
         return transformed[0:1]
     # -------------------- Selection and quality helpers --------------------
+    def _compute_change_score(self, original: torch.Tensor, augmented: torch.Tensor) -> float:
         """
         Computes a normalized change score between original and augmented series.
         The score is the Mean Absolute Error (MAE) normalized by a robust
     # moved to src/synthetic_generation/augmentations/filter.py
+    def _setup_proportions(self, generator_proportions: dict[str, float] | None) -> dict[str, float]:
         # Default uniform proportions across discovered generators
         if generator_proportions is None:
             # Discover generator directories
             base = Path(self.base_data_dir)
             discovered = [p.name for p in base.iterdir() if p.is_dir()]
+            proportions = dict.fromkeys(discovered, 1.0)
         else:
             proportions = dict(generator_proportions)
             raise ValueError("Total generator proportions must be positive")
         return {k: v / total for k, v in proportions.items()}
+    def _initialize_datasets(self) -> dict[str, CyclicalBatchDataset]:
+        datasets: dict[str, CyclicalBatchDataset] = {}
         for generator_name, proportion in self.generator_proportions.items():
             # Load batches only if the generator is explicitly listed and has positive proportion
             if proportion <= 0:
                 continue
             batches_dir = Path(self.base_data_dir) / generator_name
             if not batches_dir.is_dir():
+                logging.warning(f"Skipping '{generator_name}' because directory does not exist: {batches_dir}")
                 continue
             try:
                 dataset = CyclicalBatchDataset(
             raise ValueError("No valid datasets loaded from base_data_dir")
         return datasets
+    def _convert_sample_to_tensor(self, sample: dict) -> tuple[torch.Tensor, Any, str, int]:
         num_channels = sample.get("num_channels", 1)
         values_data = sample["values"]
     def _sample_generator_name(self) -> str:
         available = [g for g in self.generator_proportions.keys() if g in self.datasets]
+        probs = np.array([self.generator_proportions[g] for g in available], dtype=float)
         probs = probs / probs.sum()
         return str(np.random.choice(available, p=probs))
+    def _get_one_sample(self, total_length_for_batch: int) -> tuple[torch.Tensor, pd.Timestamp, str, int]:
         attempts = 0
         while attempts < 20:
             attempts += 1
             gen_name = self._sample_generator_name()
             dataset = self.datasets[gen_name]
             sample = dataset.get_samples(1)[0]
+            values, start, freq_str, num_channels = self._convert_sample_to_tensor(sample)
             values = self._maybe_resize(values, total_length_for_batch)
             if values.shape[2] != 1:
                 continue
             return values, start, freq_str, num_channels
+        raise RuntimeError("Failed to sample a valid univariate series after multiple attempts")
     def _get_one_sample_from_generator(
         self, gen_name: str, total_length_for_batch: int
+    ) -> tuple[torch.Tensor, pd.Timestamp, str, int]:
         attempts = 0
         dataset = self.datasets[gen_name]
         while attempts < 20:
             attempts += 1
             sample = dataset.get_samples(1)[0]
+            values, start, freq_str, num_channels = self._convert_sample_to_tensor(sample)
             values = self._maybe_resize(values, total_length_for_batch)
             if values.shape[2] != 1:
                 continue
             f"Failed to sample a valid univariate series from generator '{gen_name}' after multiple attempts"
         )
+    def _choose_generators_for_mixup(self, k: int) -> list[str]:
         available = [g for g in self.generator_proportions.keys() if g in self.datasets]
         if not available:
             raise RuntimeError("No available generators to sample from for mixup")
         k_eff = min(k, len(available))
         # Weighted sampling without replacement by sequential renormalization
+        chosen: list[str] = []
         remaining = available.copy()
         while len(chosen) < k_eff:
+            weights = np.array([self.generator_proportions[g] for g in remaining], dtype=float)
             if weights.sum() <= 0:
                 # fallback to uniform
                 probs = np.ones(len(remaining)) / len(remaining)
             remaining.remove(pick)
         return chosen
+    def _maybe_apply_mixup_to_single(self, base_series: torch.Tensor, total_length_for_batch: int) -> torch.Tensor:
+        do_mixup = self.augmentor.augmentations.get(
+            "mixup_augmentation", False
+        ) and self.augmentor.rng.random() < self.augmentor.augmentation_probabilities.get("mixup_augmentation", 0.0)
         if not do_mixup:
             return base_series
             return base_series
         # Decide number of sources k consistent with MixUpAugmenter behavior
+        current_k = mixup._sample_k() if not mixup.randomize_k else int(self.augmentor.rng.integers(2, mixup.max_k + 1))
         # Choose distinct generators for sources according to proportions
         chosen_gens = self._choose_generators_for_mixup(current_k)
         # Collect one source per chosen generator
+        sources: list[torch.Tensor] = []
         for gen in chosen_gens:
+            src_values, _, _, _ = self._get_one_sample_from_generator(gen, total_length_for_batch)
             sources.append(src_values)
         source_tensor = torch.cat(sources, dim=0)
         mixed_series = mixup.mix_sources(source_tensor, alpha=alpha)
         return mixed_series
+    def _tensor_to_values_list(self, series_tensor: torch.Tensor) -> tuple[list[list[float]], int, int]:
         # series_tensor shape: [1, seq_len, num_channels]
         seq_len = int(series_tensor.shape[1])
         num_channels = int(series_tensor.shape[2])
         if num_channels == 1:
             return [series_tensor.squeeze(0).squeeze(-1).tolist()], seq_len, 1
+        channels: list[list[float]] = []
         for ch in range(num_channels):
             channels.append(series_tensor[0, :, ch].tolist())
         return channels, seq_len, num_channels
             f"Starting offline augmentation into {self.dataset_manager.batches_dir} | chunk_size={self.chunk_size}"
         )
+        augmented_buffer: list[dict[str, Any]] = []
         target_batches = num_batches
         start_time = time.time()
             while self.dataset_manager.batch_counter < target_batches:
                 # Decide target length for this sample
                 total_length_for_batch = (
+                    self.length if self.length is not None else int(np.random.choice(LENGTH_CHOICES))
                 )
                 for _ in range(max(1, self.max_tries)):
                     # Sample one base series
+                    base_values, base_start, base_freq, _ = self._get_one_sample(total_length_for_batch)
                     original_base = base_values.clone()
                     # Per-sample scaler choice (50% none; else robust/minmax/median/mean)
                         self.augmentor.augmentations.get("mixup_augmentation", False)
                         and self.mixup_position in ["first", "both"]
                         and self.augmentor.rng.random()
+                        < self.augmentor.augmentation_probabilities.get("mixup_augmentation", 0.0)
                     )
                     if do_mixup_early:
                         base_values = self._apply_mixup_to_series(
                     )
                     # Optional analytic: RandomConvAugmenter via temp batch (before late mixup)
+                    if self.augmentor.augmentations.get("random_conv_augmentation", False):
+                        if self.rng.random() < self.augmentor.augmentation_probabilities.get(
+                            "random_conv_augmentation", 0.3
                         ):
                             augmented_single = self._apply_random_conv_with_temp_batch(
                                 augmented_single,
                         self.augmentor.augmentations.get("mixup_augmentation", False)
                         and self.mixup_position in ["last", "both"]
                         and self.augmentor.rng.random()
+                        < self.augmentor.augmentation_probabilities.get("mixup_augmentation", 0.0)
                     )
                     if do_mixup_late:
                         augmented_single = self._apply_mixup_to_series(
                         continue
                     # Accept first candidate that passes thresholds
+                    values_list, seq_len, num_channels = self._tensor_to_values_list(augmented_single)
                     record = {
                         "series_id": self.dataset_manager.series_counter,
                         "values": values_list,
                     self.dataset_manager.append_batch(augmented_buffer)
                     write_time = time.time() - write_start
                     elapsed = time.time() - start_time
+                    series_per_sec = self.dataset_manager.series_counter / elapsed if elapsed > 0 else 0
                     print(
+                        f"✓ Wrote batch {self.dataset_manager.batch_counter - 1}/{target_batches} | "
+                        f"Series: {self.dataset_manager.series_counter:,} | "
+                        f"Rate: {series_per_sec:.1f}/s | "
+                        f"Write: {write_time:.2f}s"
                     )
                     augmented_buffer = []
         except KeyboardInterrupt:
             logging.info(
+                f"Interrupted. Generated {self.dataset_manager.series_counter} series, "
+                f"{self.dataset_manager.batch_counter} batches."
             )
         finally:
             # Flush remaining buffer if any
         help="Temporary batch size used for RandomConvAugmenter",
     )
     parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
+    parser.add_argument("--global-seed", type=int, default=42, help="Global random seed")
     args = parser.parse_args()
     setup_logging(args.verbose)

src/synthetic_generation/augmentations/offline_temp_batch_augmentations.py CHANGED Viewed

@@ -3,12 +3,11 @@ import logging
 import sys
 import time
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 import pandas as pd
 import torch
 from src.data.augmentations import (
     CensorAugmenter,
     DifferentialAugmenter,
@@ -33,12 +32,12 @@ class OfflineTempBatchAugmentedGenerator:
         self,
         base_data_dir: str,
         output_dir: str,
-        length: Optional[int],
         mixed_batch_size: int = 10,
         chunk_size: int = 2**13,
-        generator_proportions: Optional[Dict[str, float]] = None,
-        augmentations: Optional[Dict[str, bool]] = None,
-        augmentation_probabilities: Optional[Dict[str, float]] = None,
         global_seed: int = 42,
         mixup_position: str = "both",
         selection_strategy: str = "random",
@@ -54,14 +53,8 @@ class OfflineTempBatchAugmentedGenerator:
         np.random.seed(global_seed)
         torch.manual_seed(global_seed)
-        out_dir_name = (
-            f"augmented_temp_batch_{length}"
-            if length is not None
-            else "augmented_temp_batch"
-        )
-        self.dataset_manager = TimeSeriesDatasetManager(
-            str(Path(output_dir) / out_dir_name), batch_size=chunk_size
-        )
         # Augmentation config
         self.augmentation_probabilities = augmentation_probabilities or {}
@@ -82,16 +75,12 @@ class OfflineTempBatchAugmentedGenerator:
         self.flip_augmenter = None
         if self.augmentations.get("time_flip_augmentation", False):
             self.flip_augmenter = TimeFlipAugmenter(
-                p_flip=self.augmentation_probabilities.get(
-                    "time_flip_augmentation", 0.5
-                )
             )
         self.yflip_augmenter = None
         if self.augmentations.get("yflip_augmentation", False):
-            self.yflip_augmenter = YFlipAugmenter(
-                p_flip=self.augmentation_probabilities.get("yflip_augmentation", 0.5)
-            )
         self.censor_augmenter = None
         if self.augmentations.get("censor_augmentation", False):
@@ -100,9 +89,7 @@ class OfflineTempBatchAugmentedGenerator:
         self.quantization_augmenter = None
         if self.augmentations.get("quantization_augmentation", False):
             self.quantization_augmenter = QuantizationAugmenter(
-                p_quantize=self.augmentation_probabilities.get(
-                    "censor_or_quantization_augmentation", 0.5
-                ),
                 level_range=(5, 15),
             )
@@ -115,17 +102,13 @@ class OfflineTempBatchAugmentedGenerator:
         self.differential_augmentor = None
         if self.augmentations.get("differential_augmentation", False):
             self.differential_augmentor = DifferentialAugmenter(
-                p_transform=self.augmentation_probabilities.get(
-                    "differential_augmentation", 0.5
-                )
             )
         self.random_conv_augmenter = None
         if self.augmentations.get("random_conv_augmentation", False):
             self.random_conv_augmenter = RandomConvAugmenter(
-                p_transform=self.augmentation_probabilities.get(
-                    "random_conv_augmentation", 0.3
-                )
             )
         self.generator_proportions = self._setup_proportions(generator_proportions)
@@ -138,12 +121,10 @@ class OfflineTempBatchAugmentedGenerator:
             global_seed=global_seed,
         )
-    def _compute_change_scores(
-        self, original_batch: torch.Tensor, augmented_batch: torch.Tensor
-    ) -> np.ndarray:
         # Normalized MAE vs IQR (q25-q75) per element
         bsz = augmented_batch.shape[0]
-        scores: List[float] = []
         for i in range(bsz):
             base_flat = original_batch[i].reshape(-1)
             q25 = torch.quantile(base_flat, 0.25)
@@ -154,14 +135,12 @@ class OfflineTempBatchAugmentedGenerator:
             scores.append(mae / iqr)
         return np.asarray(scores, dtype=float)
-    def _setup_proportions(
-        self, generator_proportions: Optional[Dict[str, float]]
-    ) -> Dict[str, float]:
         # Default uniform across discovered generators
         if generator_proportions is None:
             base = Path(self.base_data_dir)
             discovered = [p.name for p in base.iterdir() if p.is_dir()]
-            proportions = {name: 1.0 for name in discovered}
         else:
             proportions = dict(generator_proportions)
@@ -170,16 +149,14 @@ class OfflineTempBatchAugmentedGenerator:
             raise ValueError("Total generator proportions must be positive")
         return {k: v / total for k, v in proportions.items()}
-    def _initialize_datasets(self) -> Dict[str, CyclicalBatchDataset]:
-        datasets: Dict[str, CyclicalBatchDataset] = {}
         for generator_name, proportion in self.generator_proportions.items():
             if proportion <= 0:
                 continue
             batches_dir = Path(self.base_data_dir) / generator_name
             if not batches_dir.is_dir():
-                logging.warning(
-                    f"Skipping '{generator_name}' because directory does not exist: {batches_dir}"
-                )
                 continue
             try:
                 dataset = CyclicalBatchDataset(
@@ -199,9 +176,7 @@ class OfflineTempBatchAugmentedGenerator:
     def _sample_generator_name(self) -> str:
         available = [g for g in self.generator_proportions.keys() if g in self.datasets]
-        probs = np.array(
-            [self.generator_proportions[g] for g in available], dtype=float
-        )
         probs = probs / probs.sum()
         return str(self.rng.choice(available, p=probs))
@@ -226,9 +201,7 @@ class OfflineTempBatchAugmentedGenerator:
         except Exception:
             return f"{gen_name}:rand:{self.rng.integers(0, 1 << 31)}"
-    def _convert_sample_to_tensor(
-        self, sample: dict
-    ) -> Tuple[torch.Tensor, pd.Timestamp, str, int]:
         num_channels = sample.get("num_channels", 1)
         values_data = sample["values"]
@@ -247,16 +220,10 @@ class OfflineTempBatchAugmentedGenerator:
         freq_str = sample["frequency"]
         start_val = sample["start"]
-        start = (
-            start_val
-            if isinstance(start_val, pd.Timestamp)
-            else pd.Timestamp(start_val)
-        )
         return values, start, freq_str, num_channels
-    def _shorten_like_batch_composer(
-        self, values: torch.Tensor, target_len: int
-    ) -> Optional[torch.Tensor]:
         # Only shorten if longer; if shorter than target_len, reject (to keep batch aligned)
         seq_len = int(values.shape[1])
         if seq_len == target_len:
@@ -274,9 +241,7 @@ class OfflineTempBatchAugmentedGenerator:
         return values[:, indices, :]
     def _maybe_apply_scaler(self, values: torch.Tensor) -> torch.Tensor:
-        scaler_choice = str(
-            self.rng.choice(["robust", "minmax", "median", "mean", "none"])
-        )
         scaler = None
         if scaler_choice == "robust":
             scaler = RobustScaler()
@@ -293,8 +258,8 @@ class OfflineTempBatchAugmentedGenerator:
     def _apply_augmentations(
         self,
         batch_values: torch.Tensor,
-        starts: List[pd.Timestamp],
-        freqs: List[str],
     ) -> torch.Tensor:
         if not self.apply_augmentations:
             return batch_values
@@ -314,17 +279,13 @@ class OfflineTempBatchAugmentedGenerator:
             s = batch_values[i : i + 1]
             start_i = starts[i] if i < len(starts) else None
             freq_i = freqs[i] if i < len(freqs) else None
-            s_aug = self.per_series_augmentor.apply_per_series_only(
-                s, start=start_i, frequency=freq_i
-            )
             augmented_list.append(s_aug)
         batch_values = torch.cat(augmented_list, dim=0)
         # 3) Noise augmentation (batch-level)
         if self.augmentations.get("noise_augmentation", False):
-            if self.rng.random() < self.augmentation_probabilities.get(
-                "noise_augmentation", 0.5
-            ):
                 noise_std = 0.01 * torch.std(batch_values)
                 if torch.isfinite(noise_std) and (noise_std > 0):
                     noise = torch.normal(0, noise_std, size=batch_values.shape)
@@ -332,20 +293,13 @@ class OfflineTempBatchAugmentedGenerator:
         # 4) Scaling augmentation (batch-level)
         if self.augmentations.get("scaling_augmentation", False):
-            if self.rng.random() < self.augmentation_probabilities.get(
-                "scaling_augmentation", 0.5
-            ):
                 scale_factor = float(self.rng.uniform(0.95, 1.05))
                 batch_values = batch_values * scale_factor
         # 5) RandomConvAugmenter (batch-level)
-        if (
-            self.augmentations.get("random_conv_augmentation", False)
-            and self.random_conv_augmenter is not None
-        ):
-            if self.rng.random() < self.augmentation_probabilities.get(
-                "random_conv_augmentation", 0.3
-            ):
                 batch_values = self.random_conv_augmenter.transform(batch_values)
         # 6) Late mixup (batch-level)
@@ -360,7 +314,7 @@ class OfflineTempBatchAugmentedGenerator:
     def _get_one_source_sample(
         self, total_length_for_batch: int, used_source_keys: set
-    ) -> Optional[Tuple[torch.Tensor, pd.Timestamp, str, str]]:
         # Returns (values, start, freq, source_key) or None if cannot fetch
         attempts = 0
         while attempts < 50:
@@ -368,18 +322,14 @@ class OfflineTempBatchAugmentedGenerator:
             gen_name = self._sample_generator_name()
             dataset = self.datasets[gen_name]
             sample = dataset.get_samples(1)[0]
-            values, start, freq_str, num_channels = self._convert_sample_to_tensor(
-                sample
-            )
             if num_channels != 1:
                 continue
             # Reject NaNs
             if torch.isnan(values).any():
                 continue
             # Shorten to target_len; reject if too short
-            shortened = self._shorten_like_batch_composer(
-                values, total_length_for_batch
-            )
             if shortened is None:
                 continue
             values = shortened
@@ -394,24 +344,24 @@ class OfflineTempBatchAugmentedGenerator:
             return values, start, freq_str, key
         return None
-    def _tensor_to_values_list(
-        self, series_tensor: torch.Tensor
-    ) -> Tuple[List[List[float]], int, int]:
         seq_len = int(series_tensor.shape[1])
         num_channels = int(series_tensor.shape[2])
         if num_channels == 1:
             return [series_tensor.squeeze(0).squeeze(-1).tolist()], seq_len, 1
-        channels: List[List[float]] = []
         for ch in range(num_channels):
             channels.append(series_tensor[0, :, ch].tolist())
         return channels, seq_len, num_channels
     def run(self, num_batches: int) -> None:
         logging.info(
-            f"Starting offline IID augmentation into {self.dataset_manager.batches_dir} | chunk_size={self.chunk_size} | mixed_batch_size={self.mixed_batch_size}"
         )
-        augmented_buffer: List[Dict[str, Any]] = []
         target_batches = num_batches
         start_time = time.time()
@@ -419,28 +369,21 @@ class OfflineTempBatchAugmentedGenerator:
             while self.dataset_manager.batch_counter < target_batches:
                 # Decide target length for this temp batch
                 total_length_for_batch = (
-                    self.length
-                    if self.length is not None
-                    else int(self.rng.choice(LENGTH_CHOICES))
                 )
-                selected_record: Optional[Dict[str, Any]] = None
                 for _retry in range(max(1, self.temp_batch_retries + 1)):
                     # Collect a temporary mixed batch without reusing sources
-                    temp_values_list: List[torch.Tensor] = []
-                    temp_starts: List[pd.Timestamp] = []
-                    temp_freqs: List[str] = []
                     temp_used_keys: set = set()
                     attempts = 0
-                    while (
-                        len(temp_values_list) < self.mixed_batch_size
-                        and attempts < self.mixed_batch_size * 200
-                    ):
                         attempts += 1
-                        fetched = self._get_one_source_sample(
-                            total_length_for_batch, temp_used_keys
-                        )
                         if fetched is None:
                             continue
                         values, start, freq, _ = fetched
@@ -456,28 +399,24 @@ class OfflineTempBatchAugmentedGenerator:
                     original_temp_batch = temp_batch.clone()
                     # Apply augmentations sequentially
-                    augmented_temp_batch = self._apply_augmentations(
-                        temp_batch, temp_starts, temp_freqs
-                    )
                     # Compute change scores
-                    scores = self._compute_change_scores(
-                        original_temp_batch, augmented_temp_batch
-                    )
                     # Build eligible indices by threshold
                     eligible = np.where(scores >= self.change_threshold)[0].tolist()
                     # Apply quality filter if enabled
                     if self.enable_quality_filter:
-                        eligible_q: List[int] = []
                         for idx in eligible:
                             cand = augmented_temp_batch[idx : idx + 1]
                             if not is_low_quality(cand):
                                 eligible_q.append(idx)
                         eligible = eligible_q
-                    sel_idx: Optional[int] = None
                     if self.selection_strategy == "max_change":
                         if eligible:
                             sel_idx = int(max(eligible, key=lambda i: scores[i]))
@@ -487,35 +426,25 @@ class OfflineTempBatchAugmentedGenerator:
                                 qual_idxs = [
                                     i
                                     for i in range(augmented_temp_batch.shape[0])
-                                    if not is_low_quality(
-                                        augmented_temp_batch[i : i + 1]
-                                    )
                                 ]
                                 if qual_idxs:
-                                    sel_idx = int(
-                                        max(qual_idxs, key=lambda i: scores[i])
-                                    )
                             if sel_idx is None:
                                 sel_idx = int(np.argmax(scores))
                     else:
                         # random selection among eligible, else fallback to best
                         if eligible:
-                            sel_idx = int(
-                                self.rng.choice(np.asarray(eligible, dtype=int))
-                            )
                         else:
                             if self.enable_quality_filter:
                                 qual_idxs = [
                                     i
                                     for i in range(augmented_temp_batch.shape[0])
-                                    if not is_low_quality(
-                                        augmented_temp_batch[i : i + 1]
-                                    )
                                 ]
                                 if qual_idxs:
-                                    sel_idx = int(
-                                        max(qual_idxs, key=lambda i: scores[i])
-                                    )
                             if sel_idx is None:
                                 sel_idx = int(np.argmax(scores))
@@ -524,9 +453,7 @@ class OfflineTempBatchAugmentedGenerator:
                         continue
                     selected_series = augmented_temp_batch[sel_idx : sel_idx + 1]
-                    values_list, seq_len, num_channels = self._tensor_to_values_list(
-                        selected_series
-                    )
                     selected_record = {
                         "series_id": self.dataset_manager.series_counter,
                         "values": values_list,
@@ -550,19 +477,19 @@ class OfflineTempBatchAugmentedGenerator:
                     self.dataset_manager.append_batch(augmented_buffer)
                     write_time = time.time() - write_start
                     elapsed = time.time() - start_time
-                    series_per_sec = (
-                        self.dataset_manager.series_counter / elapsed
-                        if elapsed > 0
-                        else 0
-                    )
                     print(
-                        f"✓ Wrote batch {self.dataset_manager.batch_counter - 1}/{target_batches} | Series: {self.dataset_manager.series_counter:,} | Rate: {series_per_sec:.1f}/s | Write: {write_time:.2f}s"
                     )
                     augmented_buffer = []
         except KeyboardInterrupt:
             logging.info(
-                f"Interrupted. Generated {self.dataset_manager.series_counter} series, {self.dataset_manager.batch_counter} batches."
             )
         finally:
             if augmented_buffer:
@@ -653,9 +580,7 @@ def main():
         help="Number of times to rebuild temp batch if selection fails thresholds",
     )
     parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
-    parser.add_argument(
-        "--global-seed", type=int, default=42, help="Global random seed"
-    )
     args = parser.parse_args()
     setup_logging(args.verbose)

 import sys
 import time
 from pathlib import Path
+from typing import Any
 import numpy as np
 import pandas as pd
 import torch
 from src.data.augmentations import (
     CensorAugmenter,
     DifferentialAugmenter,
         self,
         base_data_dir: str,
         output_dir: str,
+        length: int | None,
         mixed_batch_size: int = 10,
         chunk_size: int = 2**13,
+        generator_proportions: dict[str, float] | None = None,
+        augmentations: dict[str, bool] | None = None,
+        augmentation_probabilities: dict[str, float] | None = None,
         global_seed: int = 42,
         mixup_position: str = "both",
         selection_strategy: str = "random",
         np.random.seed(global_seed)
         torch.manual_seed(global_seed)
+        out_dir_name = f"augmented_temp_batch_{length}" if length is not None else "augmented_temp_batch"
+        self.dataset_manager = TimeSeriesDatasetManager(str(Path(output_dir) / out_dir_name), batch_size=chunk_size)
         # Augmentation config
         self.augmentation_probabilities = augmentation_probabilities or {}
         self.flip_augmenter = None
         if self.augmentations.get("time_flip_augmentation", False):
             self.flip_augmenter = TimeFlipAugmenter(
+                p_flip=self.augmentation_probabilities.get("time_flip_augmentation", 0.5)
             )
         self.yflip_augmenter = None
         if self.augmentations.get("yflip_augmentation", False):
+            self.yflip_augmenter = YFlipAugmenter(p_flip=self.augmentation_probabilities.get("yflip_augmentation", 0.5))
         self.censor_augmenter = None
         if self.augmentations.get("censor_augmentation", False):
         self.quantization_augmenter = None
         if self.augmentations.get("quantization_augmentation", False):
             self.quantization_augmenter = QuantizationAugmenter(
+                p_quantize=self.augmentation_probabilities.get("censor_or_quantization_augmentation", 0.5),
                 level_range=(5, 15),
             )
         self.differential_augmentor = None
         if self.augmentations.get("differential_augmentation", False):
             self.differential_augmentor = DifferentialAugmenter(
+                p_transform=self.augmentation_probabilities.get("differential_augmentation", 0.5)
             )
         self.random_conv_augmenter = None
         if self.augmentations.get("random_conv_augmentation", False):
             self.random_conv_augmenter = RandomConvAugmenter(
+                p_transform=self.augmentation_probabilities.get("random_conv_augmentation", 0.3)
             )
         self.generator_proportions = self._setup_proportions(generator_proportions)
             global_seed=global_seed,
         )
+    def _compute_change_scores(self, original_batch: torch.Tensor, augmented_batch: torch.Tensor) -> np.ndarray:
         # Normalized MAE vs IQR (q25-q75) per element
         bsz = augmented_batch.shape[0]
+        scores: list[float] = []
         for i in range(bsz):
             base_flat = original_batch[i].reshape(-1)
             q25 = torch.quantile(base_flat, 0.25)
             scores.append(mae / iqr)
         return np.asarray(scores, dtype=float)
+    def _setup_proportions(self, generator_proportions: dict[str, float] | None) -> dict[str, float]:
         # Default uniform across discovered generators
         if generator_proportions is None:
             base = Path(self.base_data_dir)
             discovered = [p.name for p in base.iterdir() if p.is_dir()]
+            proportions = dict.fromkeys(discovered, 1.0)
         else:
             proportions = dict(generator_proportions)
             raise ValueError("Total generator proportions must be positive")
         return {k: v / total for k, v in proportions.items()}
+    def _initialize_datasets(self) -> dict[str, CyclicalBatchDataset]:
+        datasets: dict[str, CyclicalBatchDataset] = {}
         for generator_name, proportion in self.generator_proportions.items():
             if proportion <= 0:
                 continue
             batches_dir = Path(self.base_data_dir) / generator_name
             if not batches_dir.is_dir():
+                logging.warning(f"Skipping '{generator_name}' because directory does not exist: {batches_dir}")
                 continue
             try:
                 dataset = CyclicalBatchDataset(
     def _sample_generator_name(self) -> str:
         available = [g for g in self.generator_proportions.keys() if g in self.datasets]
+        probs = np.array([self.generator_proportions[g] for g in available], dtype=float)
         probs = probs / probs.sum()
         return str(self.rng.choice(available, p=probs))
         except Exception:
             return f"{gen_name}:rand:{self.rng.integers(0, 1 << 31)}"
+    def _convert_sample_to_tensor(self, sample: dict) -> tuple[torch.Tensor, pd.Timestamp, str, int]:
         num_channels = sample.get("num_channels", 1)
         values_data = sample["values"]
         freq_str = sample["frequency"]
         start_val = sample["start"]
+        start = start_val if isinstance(start_val, pd.Timestamp) else pd.Timestamp(start_val)
         return values, start, freq_str, num_channels
+    def _shorten_like_batch_composer(self, values: torch.Tensor, target_len: int) -> torch.Tensor | None:
         # Only shorten if longer; if shorter than target_len, reject (to keep batch aligned)
         seq_len = int(values.shape[1])
         if seq_len == target_len:
         return values[:, indices, :]
     def _maybe_apply_scaler(self, values: torch.Tensor) -> torch.Tensor:
+        scaler_choice = str(self.rng.choice(["robust", "minmax", "median", "mean", "none"]))
         scaler = None
         if scaler_choice == "robust":
             scaler = RobustScaler()
     def _apply_augmentations(
         self,
         batch_values: torch.Tensor,
+        starts: list[pd.Timestamp],
+        freqs: list[str],
     ) -> torch.Tensor:
         if not self.apply_augmentations:
             return batch_values
             s = batch_values[i : i + 1]
             start_i = starts[i] if i < len(starts) else None
             freq_i = freqs[i] if i < len(freqs) else None
+            s_aug = self.per_series_augmentor.apply_per_series_only(s, start=start_i, frequency=freq_i)
             augmented_list.append(s_aug)
         batch_values = torch.cat(augmented_list, dim=0)
         # 3) Noise augmentation (batch-level)
         if self.augmentations.get("noise_augmentation", False):
+            if self.rng.random() < self.augmentation_probabilities.get("noise_augmentation", 0.5):
                 noise_std = 0.01 * torch.std(batch_values)
                 if torch.isfinite(noise_std) and (noise_std > 0):
                     noise = torch.normal(0, noise_std, size=batch_values.shape)
         # 4) Scaling augmentation (batch-level)
         if self.augmentations.get("scaling_augmentation", False):
+            if self.rng.random() < self.augmentation_probabilities.get("scaling_augmentation", 0.5):
                 scale_factor = float(self.rng.uniform(0.95, 1.05))
                 batch_values = batch_values * scale_factor
         # 5) RandomConvAugmenter (batch-level)
+        if self.augmentations.get("random_conv_augmentation", False) and self.random_conv_augmenter is not None:
+            if self.rng.random() < self.augmentation_probabilities.get("random_conv_augmentation", 0.3):
                 batch_values = self.random_conv_augmenter.transform(batch_values)
         # 6) Late mixup (batch-level)
     def _get_one_source_sample(
         self, total_length_for_batch: int, used_source_keys: set
+    ) -> tuple[torch.Tensor, pd.Timestamp, str, str] | None:
         # Returns (values, start, freq, source_key) or None if cannot fetch
         attempts = 0
         while attempts < 50:
             gen_name = self._sample_generator_name()
             dataset = self.datasets[gen_name]
             sample = dataset.get_samples(1)[0]
+            values, start, freq_str, num_channels = self._convert_sample_to_tensor(sample)
             if num_channels != 1:
                 continue
             # Reject NaNs
             if torch.isnan(values).any():
                 continue
             # Shorten to target_len; reject if too short
+            shortened = self._shorten_like_batch_composer(values, total_length_for_batch)
             if shortened is None:
                 continue
             values = shortened
             return values, start, freq_str, key
         return None
+    def _tensor_to_values_list(self, series_tensor: torch.Tensor) -> tuple[list[list[float]], int, int]:
         seq_len = int(series_tensor.shape[1])
         num_channels = int(series_tensor.shape[2])
         if num_channels == 1:
             return [series_tensor.squeeze(0).squeeze(-1).tolist()], seq_len, 1
+        channels: list[list[float]] = []
         for ch in range(num_channels):
             channels.append(series_tensor[0, :, ch].tolist())
         return channels, seq_len, num_channels
     def run(self, num_batches: int) -> None:
         logging.info(
+            f"Starting offline IID augmentation into {self.dataset_manager.batches_dir} | "
+            f"chunk_size={self.chunk_size} | "
+            f"mixed_batch_size={self.mixed_batch_size}"
         )
+        augmented_buffer: list[dict[str, Any]] = []
         target_batches = num_batches
         start_time = time.time()
             while self.dataset_manager.batch_counter < target_batches:
                 # Decide target length for this temp batch
                 total_length_for_batch = (
+                    self.length if self.length is not None else int(self.rng.choice(LENGTH_CHOICES))
                 )
+                selected_record: dict[str, Any] | None = None
                 for _retry in range(max(1, self.temp_batch_retries + 1)):
                     # Collect a temporary mixed batch without reusing sources
+                    temp_values_list: list[torch.Tensor] = []
+                    temp_starts: list[pd.Timestamp] = []
+                    temp_freqs: list[str] = []
                     temp_used_keys: set = set()
                     attempts = 0
+                    while len(temp_values_list) < self.mixed_batch_size and attempts < self.mixed_batch_size * 200:
                         attempts += 1
+                        fetched = self._get_one_source_sample(total_length_for_batch, temp_used_keys)
                         if fetched is None:
                             continue
                         values, start, freq, _ = fetched
                     original_temp_batch = temp_batch.clone()
                     # Apply augmentations sequentially
+                    augmented_temp_batch = self._apply_augmentations(temp_batch, temp_starts, temp_freqs)
                     # Compute change scores
+                    scores = self._compute_change_scores(original_temp_batch, augmented_temp_batch)
                     # Build eligible indices by threshold
                     eligible = np.where(scores >= self.change_threshold)[0].tolist()
                     # Apply quality filter if enabled
                     if self.enable_quality_filter:
+                        eligible_q: list[int] = []
                         for idx in eligible:
                             cand = augmented_temp_batch[idx : idx + 1]
                             if not is_low_quality(cand):
                                 eligible_q.append(idx)
                         eligible = eligible_q
+                    sel_idx: int | None = None
                     if self.selection_strategy == "max_change":
                         if eligible:
                             sel_idx = int(max(eligible, key=lambda i: scores[i]))
                                 qual_idxs = [
                                     i
                                     for i in range(augmented_temp_batch.shape[0])
+                                    if not is_low_quality(augmented_temp_batch[i : i + 1])
                                 ]
                                 if qual_idxs:
+                                    sel_idx = int(max(qual_idxs, key=lambda i: scores[i]))
                             if sel_idx is None:
                                 sel_idx = int(np.argmax(scores))
                     else:
                         # random selection among eligible, else fallback to best
                         if eligible:
+                            sel_idx = int(self.rng.choice(np.asarray(eligible, dtype=int)))
                         else:
                             if self.enable_quality_filter:
                                 qual_idxs = [
                                     i
                                     for i in range(augmented_temp_batch.shape[0])
+                                    if not is_low_quality(augmented_temp_batch[i : i + 1])
                                 ]
                                 if qual_idxs:
+                                    sel_idx = int(max(qual_idxs, key=lambda i: scores[i]))
                             if sel_idx is None:
                                 sel_idx = int(np.argmax(scores))
                         continue
                     selected_series = augmented_temp_batch[sel_idx : sel_idx + 1]
+                    values_list, seq_len, num_channels = self._tensor_to_values_list(selected_series)
                     selected_record = {
                         "series_id": self.dataset_manager.series_counter,
                         "values": values_list,
                     self.dataset_manager.append_batch(augmented_buffer)
                     write_time = time.time() - write_start
                     elapsed = time.time() - start_time
+                    series_per_sec = self.dataset_manager.series_counter / elapsed if elapsed > 0 else 0
                     print(
+                        f"✓ Wrote batch {self.dataset_manager.batch_counter - 1}/{target_batches} | "
+                        f"Series: {self.dataset_manager.series_counter:,} | "
+                        f"Rate: {series_per_sec:.1f}/s | "
+                        f"Write: {write_time:.2f}s"
                     )
                     augmented_buffer = []
         except KeyboardInterrupt:
             logging.info(
+                f"Interrupted. Generated {self.dataset_manager.series_counter} series, "
+                f"{self.dataset_manager.batch_counter} batches."
             )
         finally:
             if augmented_buffer:
         help="Number of times to rebuild temp batch if selection fails thresholds",
     )
     parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
+    parser.add_argument("--global-seed", type=int, default=42, help="Global random seed")
     args = parser.parse_args()
     setup_logging(args.verbose)

src/synthetic_generation/cauker/cauker_generator.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import functools
 import random
-from typing import Dict, List, Optional, Tuple, Union
 import cupy as cp
 import networkx as nx
@@ -13,7 +12,6 @@ from sklearn.gaussian_process.kernels import (
     RationalQuadratic,
     WhiteKernel,
 )
 from src.synthetic_generation.abstract_classes import AbstractTimeSeriesGenerator
 from src.synthetic_generation.generator_params import CauKerGeneratorParams
@@ -31,7 +29,7 @@ class CauKerGenerator(AbstractTimeSeriesGenerator):
     # -------------------------------------------------------------------------
     # 1. Kernel Bank Construction (parameterised by `time_length`)
     # -------------------------------------------------------------------------
-    def build_kernel_bank(self, time_length: int) -> List:
         return [
             # Hourly / sub‑hourly cycles
             ExpSineSquared(periodicity=24 / time_length),
@@ -123,9 +121,9 @@ class CauKerGenerator(AbstractTimeSeriesGenerator):
         *,
         kernel,
         X: np.ndarray,
-        random_seed: Optional[int] = None,
         method: str = "eigh",
-        mean_vec: Optional[np.ndarray] = None,
     ) -> np.ndarray:
         if X.ndim == 1:
             X = X[:, None]
@@ -141,9 +139,7 @@ class CauKerGenerator(AbstractTimeSeriesGenerator):
         if random_seed is not None:
             cp.random.seed(random_seed)
-        ts_gpu = cp.random.multivariate_normal(
-            mean=mean_gpu, cov=cov_gpu, method=method
-        )
         return cp.asnumpy(ts_gpu)
     # -------------------------------------------------------------------------
@@ -179,14 +175,12 @@ class CauKerGenerator(AbstractTimeSeriesGenerator):
         alpha = np.random.uniform(0.01, 0.3)
         return np.where(x > 0, x, alpha * x)
-    def random_edge_mapping(self, parents_data: List[np.ndarray]) -> np.ndarray:
         combined = np.stack(parents_data, axis=1)
         W = np.random.randn(len(parents_data))
         b = np.random.randn()
         non_linear_input = combined @ W + b
-        chosen_func = np.random.choice(
-            ["linear", "relu", "sigmoid", "sin", "mod", "leakyrelu"]
-        )
         return self.random_activation(non_linear_input, chosen_func)
     # -------------------------------------------------------------------------
@@ -200,7 +194,7 @@ class CauKerGenerator(AbstractTimeSeriesGenerator):
         max_parents: int,
         seed: int,
         num_nodes: int,
-    ) -> Dict[int, np.ndarray]:
         np.random.seed(seed)
         random.seed(seed)
@@ -208,15 +202,13 @@ class CauKerGenerator(AbstractTimeSeriesGenerator):
         kernel_bank = self.build_kernel_bank(time_length)
         root_nodes = [n for n in dag.nodes if dag.in_degree(n) == 0]
-        node_data: Dict[int, np.ndarray] = {}
         X = np.linspace(0.0, 1.0, time_length)
         # Sample roots directly from the GP prior
         for r in root_nodes:
-            selected_kernels = np.random.choice(
-                kernel_bank, np.random.randint(1, 8), replace=True
-            )
             kernel = functools.reduce(self.random_binary_map, selected_kernels)
             mean_vec = self.random_mean_combination(X)
             node_data[r] = self.sample_from_gp_prior_efficient_gpu(
@@ -236,12 +228,12 @@ class CauKerGenerator(AbstractTimeSeriesGenerator):
     # -------------------------------------------------------------------------
     # Public API: generate one multivariate series (length, num_channels)
     # -------------------------------------------------------------------------
-    def generate_time_series(self, random_seed: Optional[int] = None) -> np.ndarray:
         """Generate one multivariate series with shape (length, num_channels)."""
         seed = self.params.global_seed if random_seed is None else random_seed
         # Resolve num_channels which can be int or (min, max)
-        desired_channels: Union[int, Tuple[int, int]] = self.params.num_channels
         if isinstance(desired_channels, tuple):
             low, high = desired_channels
             if low > high:
@@ -251,9 +243,7 @@ class CauKerGenerator(AbstractTimeSeriesGenerator):
             num_channels = int(desired_channels)
         if num_channels > self.params.num_nodes:
-            raise ValueError(
-                f"num_channels ({num_channels}) cannot exceed num_nodes ({self.params.num_nodes})."
-            )
         node_data = self.generate_scm_time_series(
             time_length=self.params.length,

 import functools
 import random
 import cupy as cp
 import networkx as nx
     RationalQuadratic,
     WhiteKernel,
 )
 from src.synthetic_generation.abstract_classes import AbstractTimeSeriesGenerator
 from src.synthetic_generation.generator_params import CauKerGeneratorParams
     # -------------------------------------------------------------------------
     # 1. Kernel Bank Construction (parameterised by `time_length`)
     # -------------------------------------------------------------------------
+    def build_kernel_bank(self, time_length: int) -> list:
         return [
             # Hourly / sub‑hourly cycles
             ExpSineSquared(periodicity=24 / time_length),
         *,
         kernel,
         X: np.ndarray,
+        random_seed: int | None = None,
         method: str = "eigh",
+        mean_vec: np.ndarray | None = None,
     ) -> np.ndarray:
         if X.ndim == 1:
             X = X[:, None]
         if random_seed is not None:
             cp.random.seed(random_seed)
+        ts_gpu = cp.random.multivariate_normal(mean=mean_gpu, cov=cov_gpu, method=method)
         return cp.asnumpy(ts_gpu)
     # -------------------------------------------------------------------------
         alpha = np.random.uniform(0.01, 0.3)
         return np.where(x > 0, x, alpha * x)
+    def random_edge_mapping(self, parents_data: list[np.ndarray]) -> np.ndarray:
         combined = np.stack(parents_data, axis=1)
         W = np.random.randn(len(parents_data))
         b = np.random.randn()
         non_linear_input = combined @ W + b
+        chosen_func = np.random.choice(["linear", "relu", "sigmoid", "sin", "mod", "leakyrelu"])
         return self.random_activation(non_linear_input, chosen_func)
     # -------------------------------------------------------------------------
         max_parents: int,
         seed: int,
         num_nodes: int,
+    ) -> dict[int, np.ndarray]:
         np.random.seed(seed)
         random.seed(seed)
         kernel_bank = self.build_kernel_bank(time_length)
         root_nodes = [n for n in dag.nodes if dag.in_degree(n) == 0]
+        node_data: dict[int, np.ndarray] = {}
         X = np.linspace(0.0, 1.0, time_length)
         # Sample roots directly from the GP prior
         for r in root_nodes:
+            selected_kernels = np.random.choice(kernel_bank, np.random.randint(1, 8), replace=True)
             kernel = functools.reduce(self.random_binary_map, selected_kernels)
             mean_vec = self.random_mean_combination(X)
             node_data[r] = self.sample_from_gp_prior_efficient_gpu(
     # -------------------------------------------------------------------------
     # Public API: generate one multivariate series (length, num_channels)
     # -------------------------------------------------------------------------
+    def generate_time_series(self, random_seed: int | None = None) -> np.ndarray:
         """Generate one multivariate series with shape (length, num_channels)."""
         seed = self.params.global_seed if random_seed is None else random_seed
         # Resolve num_channels which can be int or (min, max)
+        desired_channels: int | tuple[int, int] = self.params.num_channels
         if isinstance(desired_channels, tuple):
             low, high = desired_channels
             if low > high:
             num_channels = int(desired_channels)
         if num_channels > self.params.num_nodes:
+            raise ValueError(f"num_channels ({num_channels}) cannot exceed num_nodes ({self.params.num_nodes}).")
         node_data = self.generate_scm_time_series(
             time_length=self.params.length,

src/synthetic_generation/cauker/cauker_generator_wrapper.py CHANGED Viewed

@@ -1,7 +1,6 @@
-from typing import Any, Dict, Optional
 import numpy as np
 from src.data.containers import TimeSeriesContainer
 from src.synthetic_generation.abstract_classes import GeneratorWrapper
 from src.synthetic_generation.cauker.cauker_generator import CauKerGenerator
@@ -17,7 +16,7 @@ class CauKerGeneratorWrapper(GeneratorWrapper):
         super().__init__(params)
         self.params: CauKerGeneratorParams = params
-    def _sample_parameters(self, batch_size: int) -> Dict[str, Any]:
         params = super()._sample_parameters(batch_size)
         # Resolve num_channels if range is given: sample once per batch for consistency
         desired_channels = self.params.num_channels
@@ -41,9 +40,7 @@ class CauKerGeneratorWrapper(GeneratorWrapper):
         )
         return params
-    def generate_batch(
-        self, batch_size: int, seed: Optional[int] = None
-    ) -> TimeSeriesContainer:
         # Establish a base seed to ensure different series use different seeds
         base_seed = seed if seed is not None else self.params.global_seed
         self._set_random_seeds(base_seed)

+from typing import Any
 import numpy as np
 from src.data.containers import TimeSeriesContainer
 from src.synthetic_generation.abstract_classes import GeneratorWrapper
 from src.synthetic_generation.cauker.cauker_generator import CauKerGenerator
         super().__init__(params)
         self.params: CauKerGeneratorParams = params
+    def _sample_parameters(self, batch_size: int) -> dict[str, Any]:
         params = super()._sample_parameters(batch_size)
         # Resolve num_channels if range is given: sample once per batch for consistency
         desired_channels = self.params.num_channels
         )
         return params
+    def generate_batch(self, batch_size: int, seed: int | None = None) -> TimeSeriesContainer:
         # Establish a base seed to ensure different series use different seeds
         base_seed = seed if seed is not None else self.params.global_seed
         self._set_random_seeds(base_seed)

src/synthetic_generation/continuous_generation.py CHANGED Viewed

@@ -7,7 +7,7 @@ import sys
 import tempfile
 import time
 from pathlib import Path
-from typing import Any, Dict, List, Optional
 import numpy as np
 import pandas as pd
@@ -41,7 +41,7 @@ from src.synthetic_generation.generator_params import (
     FinancialVolatilityAudioParams,
     ForecastPFNGeneratorParams,
     GPGeneratorParams,
-    KernelGeneratorParams,
     MultiScaleFractalAudioParams,
     NetworkTopologyAudioParams,
     OrnsteinUhlenbeckProcessGeneratorParams,
@@ -54,7 +54,7 @@ from src.synthetic_generation.generator_params import (
 from src.synthetic_generation.gp_prior.gp_generator_wrapper import GPGeneratorWrapper
 from src.synthetic_generation.kernel_synth.kernel_generator_wrapper import (
     KernelGeneratorWrapper,
-)
 from src.synthetic_generation.ornstein_uhlenbeck_process.ou_generator_wrapper import (
     OrnsteinUhlenbeckProcessGeneratorWrapper,
 )
@@ -114,7 +114,7 @@ class TimeSeriesDatasetManager:
         """Returns the total number of series found on disk at initialization."""
         return self.series_counter
-    def append_batch(self, batch_data: List[Dict[str, Any]]) -> None:
         """Appends a batch to a new file using an atomic rename for parallel safety."""
         if not batch_data:
             return
@@ -125,9 +125,7 @@ class TimeSeriesDatasetManager:
                 field_name = field.name
                 if field_name in ["start", "generation_timestamp"]:
                     timestamps = [d[field_name] for d in batch_data]
-                    arrays.append(
-                        pa.array([t.value for t in timestamps], type=pa.timestamp("ns"))
-                    )
                 else:
                     arrays.append(pa.array([d[field_name] for d in batch_data]))
             new_table = pa.Table.from_arrays(arrays, schema=self.schema)
@@ -137,36 +135,26 @@ class TimeSeriesDatasetManager:
         tmp_path = None
         try:
-            with tempfile.NamedTemporaryFile(
-                delete=False, dir=self.batches_dir, suffix=".arrow.tmp"
-            ) as tmp:
                 tmp_path = tmp.name
                 feather.write_feather(new_table, tmp_path)
             max_retries = 20
             for _ in range(max_retries):
                 existing = self.batches_dir.glob("batch_*.arrow")
-                batch_nums = [
-                    int(p.stem.split("_")[1])
-                    for p in existing
-                    if p.stem.split("_")[1].isdigit()
-                ]
                 next_num = max(batch_nums) + 1 if batch_nums else 0
                 target_path = self.batches_dir / f"batch_{next_num:08d}.arrow"
                 try:
                     os.rename(tmp_path, target_path)
                     self.series_counter += len(batch_data)
-                    logging.info(
-                        f"Saved {target_path.name} with {len(batch_data)} series."
-                    )
                     return
                 except FileExistsError:
-                    logging.warning(
-                        f"Race condition on {target_path.name}. Retrying..."
-                    )
                     time.sleep(random.uniform(0.1, 1.0))
-            raise IOError("Failed to write batch due to file conflicts.")
         finally:
             if tmp_path and os.path.exists(tmp_path):
                 os.remove(tmp_path)
@@ -178,16 +166,14 @@ class GeneratorWrapper:
         generator_type: str,
         length: int = 2048,
         global_seed: int = 42,
-        num_channels: Optional[int] = None,
     ):
         self.generator_type = generator_type
         self.length = length
         self.is_multivariate = generator_type.lower() in [
             "cauker_multivariate",
         ]
-        self.explode_multivariate_to_univariate = (
-            generator_type.lower() == "cauker_univariate"
-        )
         self._explode_channels = 0
         # Create appropriate parameter object and wrapper
@@ -233,9 +219,7 @@ class GeneratorWrapper:
             self._explode_channels = 6
         elif generator_type.lower() == "cauker_multivariate":
             effective_channels = (
-                int(num_channels)
-                if num_channels is not None
-                else CauKerGeneratorParams().num_channels  # type: ignore[arg-type]
             )
             params = CauKerGeneratorParams(
                 global_seed=global_seed,
@@ -295,18 +279,14 @@ class GeneratorWrapper:
         else:
             raise ValueError(f"Unsupported generator type: {generator_type}")
-    def generate_batch(self, batch_size: int, start_seed: int) -> List[Dict[str, Any]]:
         """Generate a batch of time series using the wrapper's batch generation."""
         try:
             if self.explode_multivariate_to_univariate and self._explode_channels > 0:
                 base_batch_size = int(np.ceil(batch_size / self._explode_channels))
-                container = self.wrapper.generate_batch(
-                    batch_size=base_batch_size, seed=start_seed
-                )
             else:
-                container = self.wrapper.generate_batch(
-                    batch_size=batch_size, seed=start_seed
-                )
             batch_data = []
             container_batch_size = container.values.shape[0]
@@ -316,14 +296,10 @@ class GeneratorWrapper:
                 if self.explode_multivariate_to_univariate:
                     series_data = container.values[i]
                     if series_data.ndim != 2:
-                        raise ValueError(
-                            "Expected multivariate data for CauKer univariate mode"
-                        )
                     num_channels = series_data.shape[1]
                     for channel in range(num_channels):
-                        channel_values = self._ensure_proper_format(
-                            series_data[:, channel]
-                        )
                         values_list = [channel_values.tolist()]
                         batch_data.append(
                             {
@@ -341,10 +317,7 @@ class GeneratorWrapper:
                 elif self.is_multivariate:
                     series_data = container.values[i]
                     num_channels = series_data.shape[1]
-                    values_list = [
-                        self._ensure_proper_format(series_data[:, c]).tolist()
-                        for c in range(num_channels)
-                    ]
                     seq_length = len(values_list[0])
                 else:
                     values = self._ensure_proper_format(container.values[i, :])
@@ -377,9 +350,7 @@ class GeneratorWrapper:
     def _ensure_proper_format(self, values: Any) -> np.ndarray:
         values = np.asarray(values).flatten()
         if len(values) != self.length:
-            logging.warning(
-                f"Generated series length {len(values)} != expected {self.length}. Padding/truncating."
-            )
             if len(values) > self.length:
                 values = values[: self.length]
             else:
@@ -400,7 +371,7 @@ class ContinuousGenerator:
         self.batch_size = batch_size
         self.run_id = run_id
         self.series_in_run = 0
-        self.partial_batch_data: List[Dict[str, Any]] = []
         self.shutting_down = False
         logging.info(f"Generator initialized for run_id: {self.run_id}")
@@ -413,13 +384,9 @@ class ContinuousGenerator:
             if self.shutting_down:
                 return
             self.shutting_down = True
-            logging.warning(
-                f"\nSignal {signal.Signals(signum).name} received. Shutting down."
-            )
             if self.partial_batch_data:
-                logging.info(
-                    f"Saving incomplete batch of {len(self.partial_batch_data)} series..."
-                )
                 try:
                     self.dataset_manager.append_batch(self.partial_batch_data)
                 except Exception as e:
@@ -447,9 +414,7 @@ class ContinuousGenerator:
             # Use modulo to ensure it stays within valid range
             series_id_start = (self.run_id + self.series_in_run) % (2**32)
-            new_chunk = self.generator_wrapper.generate_batch(
-                batch_size=chunk_size, start_seed=series_id_start
-            )
             if not new_chunk:
                 logging.error("Generator failed to produce data. Stopping job.")
@@ -465,11 +430,7 @@ class ContinuousGenerator:
                 batches_completed += 1
                 elapsed = time.time() - start_time
-                series_per_sec = (
-                    (batches_completed * self.batch_size) / elapsed
-                    if elapsed > 0
-                    else 0
-                )
                 print(
                     f"✓ Completed batch {batches_completed}/{num_batches_to_generate} in job | "
                     f"Total Series in DS: {self.dataset_manager.series_counter:,} | "
@@ -477,9 +438,7 @@ class ContinuousGenerator:
                 )
         if not self.shutting_down and self.partial_batch_data:
-            logging.info(
-                f"Job finished. Saving final partial batch of {len(self.partial_batch_data)}."
-            )
             self.dataset_manager.append_batch(self.partial_batch_data)
@@ -526,9 +485,7 @@ def main():
         required=True,
         help="Output directory for datasets",
     )
-    parser.add_argument(
-        "--length", type=int, default=2048, help="Length of each time series"
-    )
     parser.add_argument(
         "--batch-size",
         type=int,
@@ -559,13 +516,9 @@ def main():
     gen_name = args.generator.lower()
     if gen_name in ["cauker_multivariate"]:
         if args.num_channels is None or args.num_channels < 2:
-            logging.error(
-                "--num-channels (>=2) is required for multivariate generators"
-            )
             sys.exit(2)
-        dataset_dir_name = (
-            f"cauker_{args.num_channels}_variates"
-        )
     else:
         dataset_dir_name = args.generator
@@ -578,9 +531,7 @@ def main():
             global_seed=global_seed,
             num_channels=args.num_channels,
         )
-        dataset_manager = TimeSeriesDatasetManager(
-            str(output_path), batch_size=args.batch_size
-        )
         continuous_gen = ContinuousGenerator(
             generator_wrapper=generator_wrapper,
             dataset_manager=dataset_manager,

 import tempfile
 import time
 from pathlib import Path
+from typing import Any
 import numpy as np
 import pandas as pd
     FinancialVolatilityAudioParams,
     ForecastPFNGeneratorParams,
     GPGeneratorParams,
+    KernelGeneratorParams,
     MultiScaleFractalAudioParams,
     NetworkTopologyAudioParams,
     OrnsteinUhlenbeckProcessGeneratorParams,
 from src.synthetic_generation.gp_prior.gp_generator_wrapper import GPGeneratorWrapper
 from src.synthetic_generation.kernel_synth.kernel_generator_wrapper import (
     KernelGeneratorWrapper,
+)
 from src.synthetic_generation.ornstein_uhlenbeck_process.ou_generator_wrapper import (
     OrnsteinUhlenbeckProcessGeneratorWrapper,
 )
         """Returns the total number of series found on disk at initialization."""
         return self.series_counter
+    def append_batch(self, batch_data: list[dict[str, Any]]) -> None:
         """Appends a batch to a new file using an atomic rename for parallel safety."""
         if not batch_data:
             return
                 field_name = field.name
                 if field_name in ["start", "generation_timestamp"]:
                     timestamps = [d[field_name] for d in batch_data]
+                    arrays.append(pa.array([t.value for t in timestamps], type=pa.timestamp("ns")))
                 else:
                     arrays.append(pa.array([d[field_name] for d in batch_data]))
             new_table = pa.Table.from_arrays(arrays, schema=self.schema)
         tmp_path = None
         try:
+            with tempfile.NamedTemporaryFile(delete=False, dir=self.batches_dir, suffix=".arrow.tmp") as tmp:
                 tmp_path = tmp.name
                 feather.write_feather(new_table, tmp_path)
             max_retries = 20
             for _ in range(max_retries):
                 existing = self.batches_dir.glob("batch_*.arrow")
+                batch_nums = [int(p.stem.split("_")[1]) for p in existing if p.stem.split("_")[1].isdigit()]
                 next_num = max(batch_nums) + 1 if batch_nums else 0
                 target_path = self.batches_dir / f"batch_{next_num:08d}.arrow"
                 try:
                     os.rename(tmp_path, target_path)
                     self.series_counter += len(batch_data)
+                    logging.info(f"Saved {target_path.name} with {len(batch_data)} series.")
                     return
                 except FileExistsError:
+                    logging.warning(f"Race condition on {target_path.name}. Retrying...")
                     time.sleep(random.uniform(0.1, 1.0))
+            raise OSError("Failed to write batch due to file conflicts.")
         finally:
             if tmp_path and os.path.exists(tmp_path):
                 os.remove(tmp_path)
         generator_type: str,
         length: int = 2048,
         global_seed: int = 42,
+        num_channels: int | None = None,
     ):
         self.generator_type = generator_type
         self.length = length
         self.is_multivariate = generator_type.lower() in [
             "cauker_multivariate",
         ]
+        self.explode_multivariate_to_univariate = generator_type.lower() == "cauker_univariate"
         self._explode_channels = 0
         # Create appropriate parameter object and wrapper
             self._explode_channels = 6
         elif generator_type.lower() == "cauker_multivariate":
             effective_channels = (
+                int(num_channels) if num_channels is not None else CauKerGeneratorParams().num_channels  # type: ignore[arg-type]
             )
             params = CauKerGeneratorParams(
                 global_seed=global_seed,
         else:
             raise ValueError(f"Unsupported generator type: {generator_type}")
+    def generate_batch(self, batch_size: int, start_seed: int) -> list[dict[str, Any]]:
         """Generate a batch of time series using the wrapper's batch generation."""
         try:
             if self.explode_multivariate_to_univariate and self._explode_channels > 0:
                 base_batch_size = int(np.ceil(batch_size / self._explode_channels))
+                container = self.wrapper.generate_batch(batch_size=base_batch_size, seed=start_seed)
             else:
+                container = self.wrapper.generate_batch(batch_size=batch_size, seed=start_seed)
             batch_data = []
             container_batch_size = container.values.shape[0]
                 if self.explode_multivariate_to_univariate:
                     series_data = container.values[i]
                     if series_data.ndim != 2:
+                        raise ValueError("Expected multivariate data for CauKer univariate mode")
                     num_channels = series_data.shape[1]
                     for channel in range(num_channels):
+                        channel_values = self._ensure_proper_format(series_data[:, channel])
                         values_list = [channel_values.tolist()]
                         batch_data.append(
                             {
                 elif self.is_multivariate:
                     series_data = container.values[i]
                     num_channels = series_data.shape[1]
+                    values_list = [self._ensure_proper_format(series_data[:, c]).tolist() for c in range(num_channels)]
                     seq_length = len(values_list[0])
                 else:
                     values = self._ensure_proper_format(container.values[i, :])
     def _ensure_proper_format(self, values: Any) -> np.ndarray:
         values = np.asarray(values).flatten()
         if len(values) != self.length:
+            logging.warning(f"Generated series length {len(values)} != expected {self.length}. Padding/truncating.")
             if len(values) > self.length:
                 values = values[: self.length]
             else:
         self.batch_size = batch_size
         self.run_id = run_id
         self.series_in_run = 0
+        self.partial_batch_data: list[dict[str, Any]] = []
         self.shutting_down = False
         logging.info(f"Generator initialized for run_id: {self.run_id}")
             if self.shutting_down:
                 return
             self.shutting_down = True
+            logging.warning(f"\nSignal {signal.Signals(signum).name} received. Shutting down.")
             if self.partial_batch_data:
+                logging.info(f"Saving incomplete batch of {len(self.partial_batch_data)} series...")
                 try:
                     self.dataset_manager.append_batch(self.partial_batch_data)
                 except Exception as e:
             # Use modulo to ensure it stays within valid range
             series_id_start = (self.run_id + self.series_in_run) % (2**32)
+            new_chunk = self.generator_wrapper.generate_batch(batch_size=chunk_size, start_seed=series_id_start)
             if not new_chunk:
                 logging.error("Generator failed to produce data. Stopping job.")
                 batches_completed += 1
                 elapsed = time.time() - start_time
+                series_per_sec = (batches_completed * self.batch_size) / elapsed if elapsed > 0 else 0
                 print(
                     f"✓ Completed batch {batches_completed}/{num_batches_to_generate} in job | "
                     f"Total Series in DS: {self.dataset_manager.series_counter:,} | "
                 )
         if not self.shutting_down and self.partial_batch_data:
+            logging.info(f"Job finished. Saving final partial batch of {len(self.partial_batch_data)}.")
             self.dataset_manager.append_batch(self.partial_batch_data)
         required=True,
         help="Output directory for datasets",
     )
+    parser.add_argument("--length", type=int, default=2048, help="Length of each time series")
     parser.add_argument(
         "--batch-size",
         type=int,
     gen_name = args.generator.lower()
     if gen_name in ["cauker_multivariate"]:
         if args.num_channels is None or args.num_channels < 2:
+            logging.error("--num-channels (>=2) is required for multivariate generators")
             sys.exit(2)
+        dataset_dir_name = f"cauker_{args.num_channels}_variates"
     else:
         dataset_dir_name = args.generator
             global_seed=global_seed,
             num_channels=args.num_channels,
         )
+        dataset_manager = TimeSeriesDatasetManager(str(output_path), batch_size=args.batch_size)
         continuous_gen = ContinuousGenerator(
             generator_wrapper=generator_wrapper,
             dataset_manager=dataset_manager,