Vladyslav Moroshan

Apply ruff formatting

0a58567 about 1 month ago

16 kB

	import torch
	import torch.nn as nn
	from fla.modules import GatedMLP

	from src.data.containers import BatchTimeSeriesContainer
	from src.data.scalers import MinMaxScaler, RobustScaler
	from src.data.time_features import compute_batch_time_features
	from src.models.blocks import GatedDeltaProductEncoder
	from src.utils.utils import device


	def create_scaler(scaler_type: str, epsilon: float = 1e-3):
	"""Create scaler instance based on type."""
	if scaler_type == "custom_robust":
	return RobustScaler(epsilon=epsilon)
	elif scaler_type == "min_max":
	return MinMaxScaler(epsilon=epsilon)
	else:
	raise ValueError(f"Unknown scaler: {scaler_type}")


	def apply_channel_noise(values: torch.Tensor, noise_scale: float = 0.1):
	"""Add noise to constant channels to prevent model instability."""
	is_constant = torch.all(values == values[:, 0:1, :], dim=1)
	noise = torch.randn_like(values) * noise_scale * is_constant.unsqueeze(1)
	return values + noise


	class TimeSeriesModel(nn.Module):
	"""Time series forecasting model combining embedding, encoding, and prediction."""

	def __init__(
	self,
	# Core architecture
	embed_size: int = 128,
	num_encoder_layers: int = 2,
	# Scaling and preprocessing
	scaler: str = "custom_robust",
	epsilon: float = 1e-3,
	scaler_clamp_value: float = None,
	handle_constants: bool = False,
	# Time features
	K_max: int = 6,
	time_feature_config: dict = None,
	encoding_dropout: float = 0.0,
	# Encoder configuration
	encoder_config: dict = None,
	# Loss configuration
	loss_type: str = "huber", # "huber", "quantile"
	quantiles: list[float] = None,
	**kwargs,
	):
	super().__init__()

	# Core parameters
	self.embed_size = embed_size
	self.num_encoder_layers = num_encoder_layers
	self.epsilon = epsilon
	self.scaler_clamp_value = scaler_clamp_value
	self.handle_constants = handle_constants
	self.encoding_dropout = encoding_dropout
	self.K_max = K_max
	self.time_feature_config = time_feature_config or {}
	self.encoder_config = encoder_config or {}

	# Store loss parameters
	self.loss_type = loss_type
	self.quantiles = quantiles
	if self.loss_type == "quantile" and self.quantiles is None:
	raise ValueError("Quantiles must be provided for quantile loss.")
	if self.quantiles:
	self.register_buffer("qt", torch.tensor(self.quantiles, device=device).view(1, 1, 1, -1))

	# Validate configuration before initialization
	self._validate_configuration()

	# Initialize components
	self.scaler = create_scaler(scaler, epsilon)
	self._init_embedding_layers()
	self._init_encoder_layers(self.encoder_config, num_encoder_layers)
	self._init_projection_layers()

	def _validate_configuration(self):
	"""Validate essential model configuration parameters."""
	if "num_heads" not in self.encoder_config:
	raise ValueError("encoder_config must contain 'num_heads' parameter")

	if self.embed_size % self.encoder_config["num_heads"] != 0:
	raise ValueError(
	f"embed_size ({self.embed_size}) must be divisible by num_heads ({self.encoder_config['num_heads']})"
	)

	def _init_embedding_layers(self):
	"""Initialize value and time feature embedding layers."""
	self.expand_values = nn.Linear(1, self.embed_size, bias=True)
	self.nan_embedding = nn.Parameter(
	torch.randn(1, 1, 1, self.embed_size) / self.embed_size,
	requires_grad=True,
	)
	self.time_feature_projection = nn.Linear(self.K_max, self.embed_size)

	def _init_encoder_layers(self, encoder_config: dict, num_encoder_layers: int):
	"""Initialize encoder layers."""
	self.num_encoder_layers = num_encoder_layers

	# Ensure encoder_config has token_embed_dim
	encoder_config = encoder_config.copy()
	encoder_config["token_embed_dim"] = self.embed_size
	self.encoder_layers = nn.ModuleList(
	[
	GatedDeltaProductEncoder(layer_idx=layer_idx, **encoder_config)
	for layer_idx in range(self.num_encoder_layers)
	]
	)

	def _init_projection_layers(self):
	if self.loss_type == "quantile":
	output_dim = len(self.quantiles)
	else:
	output_dim = 1
	self.final_output_layer = nn.Linear(self.embed_size, output_dim)

	self.mlp = GatedMLP(
	hidden_size=self.embed_size,
	hidden_ratio=4,
	hidden_act="swish",
	fuse_swiglu=True,
	)
	# Initialize learnable initial hidden state for the first encoder layer
	# This will be expanded to match batch size during forward pass
	head_k_dim = self.embed_size // self.encoder_config["num_heads"]

	# Get expand_v from encoder_config, default to 1.0 if not present
	expand_v = self.encoder_config.get("expand_v", 1.0)
	head_v_dim = int(head_k_dim * expand_v)

	num_initial_hidden_states = self.num_encoder_layers
	self.initial_hidden_state = nn.ParameterList(
	[
	nn.Parameter(
	torch.randn(1, self.encoder_config["num_heads"], head_k_dim, head_v_dim) / head_k_dim,
	requires_grad=True,
	)
	for _ in range(num_initial_hidden_states)
	]
	)

	def _preprocess_data(self, data_container: BatchTimeSeriesContainer):
	"""Extract data shapes and handle constants without padding."""
	history_values = data_container.history_values
	future_values = data_container.future_values
	history_mask = data_container.history_mask

	batch_size, history_length, num_channels = history_values.shape
	future_length = future_values.shape[1] if future_values is not None else 0

	# Handle constants
	if self.handle_constants:
	history_values = apply_channel_noise(history_values)

	return {
	"history_values": history_values,
	"future_values": future_values,
	"history_mask": history_mask,
	"num_channels": num_channels,
	"history_length": history_length,
	"future_length": future_length,
	"batch_size": batch_size,
	}

	def _compute_scaling(self, history_values: torch.Tensor, history_mask: torch.Tensor = None):
	"""Compute scaling statistics and apply scaling."""
	scale_statistics = self.scaler.compute_statistics(history_values, history_mask)
	return scale_statistics

	def _apply_scaling_and_masking(self, values: torch.Tensor, scale_statistics: dict, mask: torch.Tensor = None):
	"""Apply scaling and optional masking to values."""
	scaled_values = self.scaler.scale(values, scale_statistics)

	if mask is not None:
	scaled_values = scaled_values * mask.unsqueeze(-1).float()

	if self.scaler_clamp_value is not None:
	scaled_values = torch.clamp(scaled_values, -self.scaler_clamp_value, self.scaler_clamp_value)

	return scaled_values

	def _get_positional_embeddings(
	self,
	time_features: torch.Tensor,
	num_channels: int,
	batch_size: int,
	drop_enc_allow: bool = False,
	):
	"""Generate positional embeddings from time features."""
	seq_len = time_features.shape[1]

	if (torch.rand(1).item() < self.encoding_dropout) and drop_enc_allow:
	return torch.zeros(batch_size, seq_len, num_channels, self.embed_size, device=device).to(torch.float32)

	pos_embed = self.time_feature_projection(time_features)
	return pos_embed.unsqueeze(2).expand(-1, -1, num_channels, -1)

	def _compute_embeddings(
	self,
	scaled_history: torch.Tensor,
	history_pos_embed: torch.Tensor,
	history_mask: torch.Tensor \| None = None,
	):
	"""Compute value embeddings and combine with positional embeddings."""

	nan_mask = torch.isnan(scaled_history)
	history_for_embedding = torch.nan_to_num(scaled_history, nan=0.0)
	channel_embeddings = self.expand_values(history_for_embedding.unsqueeze(-1))
	channel_embeddings[nan_mask] = self.nan_embedding.to(channel_embeddings.dtype)
	channel_embeddings = channel_embeddings + history_pos_embed

	# Suppress padded time steps completely so padding is a pure batching artifact
	# history_mask: [B, S] -> broadcast to [B, S, 1, 1]
	if history_mask is not None:
	mask_broadcast = history_mask.unsqueeze(-1).unsqueeze(-1).to(channel_embeddings.dtype)
	channel_embeddings = channel_embeddings * mask_broadcast

	batch_size, seq_len = scaled_history.shape[:2]
	all_channels_embedded = channel_embeddings.view(batch_size, seq_len, -1)

	return all_channels_embedded

	def _generate_predictions(
	self,
	embedded: torch.Tensor,
	target_pos_embed: torch.Tensor,
	prediction_length: int,
	num_channels: int,
	history_mask: torch.Tensor = None,
	):
	"""
	Generate predictions for all channels using vectorized operations.
	"""
	batch_size, seq_len, _ = embedded.shape
	# embedded shape: [B, S, N*E] -> Reshape to [B, S, N, E]
	embedded = embedded.view(batch_size, seq_len, num_channels, self.embed_size)

	# Vectorize across channels by merging the batch and channel dimensions.
	# [B, S, N, E] -> [B*N, S, E]
	channel_embedded = (
	embedded.permute(0, 2, 1, 3).contiguous().view(batch_size * num_channels, seq_len, self.embed_size)
	)

	# Reshape target positional embeddings similarly: [B, P, N, E] -> [B*N, P, E]
	target_pos_embed = (
	target_pos_embed.permute(0, 2, 1, 3)
	.contiguous()
	.view(batch_size * num_channels, prediction_length, self.embed_size)
	)
	x = channel_embedded
	target_repr = target_pos_embed
	x = torch.concatenate([x, target_repr], dim=1)
	if self.encoder_config.get("weaving", True):
	# initial hidden state is learnable
	hidden_state = torch.zeros_like(self.initial_hidden_state[0].repeat(batch_size * num_channels, 1, 1, 1))
	for layer_idx, encoder_layer in enumerate(self.encoder_layers):
	x, hidden_state = encoder_layer(
	x,
	hidden_state + self.initial_hidden_state[layer_idx].repeat(batch_size * num_channels, 1, 1, 1),
	)
	else:
	# initial hidden state is separately learnable for each layer
	for layer_idx, encoder_layer in enumerate(self.encoder_layers):
	initial_hidden_state = self.initial_hidden_state[layer_idx].repeat(batch_size * num_channels, 1, 1, 1)
	x, _ = encoder_layer(x, initial_hidden_state)

	# Use the last prediction_length positions
	prediction_embeddings = x[:, -prediction_length:, :]

	predictions = self.final_output_layer(self.mlp(prediction_embeddings))

	# Reshape output to handle quantiles
	# Original shape: [B*N, P, Q] where Q is num_quantiles or 1
	# Reshape the output back to [B, P, N, Q]
	output_dim = len(self.quantiles) if self.loss_type == "quantile" else 1
	predictions = predictions.view(batch_size, num_channels, prediction_length, output_dim)
	predictions = predictions.permute(0, 2, 1, 3) # [B, P, N, Q]
	# Squeeze the last dimension if not in quantile mode for backward compatibility
	if self.loss_type != "quantile":
	predictions = predictions.squeeze(-1) # [B, P, N]
	return predictions

	def forward(self, data_container: BatchTimeSeriesContainer, drop_enc_allow: bool = False):
	"""Main forward pass."""
	# Preprocess data
	preprocessed = self._preprocess_data(data_container)

	# Compute time features dynamically based on actual lengths
	history_time_features, target_time_features = compute_batch_time_features(
	start=data_container.start,
	history_length=preprocessed["history_length"],
	future_length=preprocessed["future_length"],
	batch_size=preprocessed["batch_size"],
	frequency=data_container.frequency,
	K_max=self.K_max,
	time_feature_config=self.time_feature_config,
	)

	# Compute scaling
	scale_statistics = self._compute_scaling(preprocessed["history_values"], preprocessed["history_mask"])

	# Apply scaling
	history_scaled = self._apply_scaling_and_masking(
	preprocessed["history_values"],
	scale_statistics,
	preprocessed["history_mask"],
	)

	# Scale future values if present
	future_scaled = None
	if preprocessed["future_values"] is not None:
	future_scaled = self.scaler.scale(preprocessed["future_values"], scale_statistics)

	# Get positional embeddings
	history_pos_embed = self._get_positional_embeddings(
	history_time_features,
	preprocessed["num_channels"],
	preprocessed["batch_size"],
	drop_enc_allow,
	)
	target_pos_embed = self._get_positional_embeddings(
	target_time_features,
	preprocessed["num_channels"],
	preprocessed["batch_size"],
	drop_enc_allow,
	)

	# Compute embeddings
	history_embed = self._compute_embeddings(history_scaled, history_pos_embed, preprocessed["history_mask"])

	# Generate predictions
	predictions = self._generate_predictions(
	history_embed,
	target_pos_embed,
	preprocessed["future_length"],
	preprocessed["num_channels"],
	preprocessed["history_mask"],
	)

	return {
	"result": predictions,
	"scale_statistics": scale_statistics,
	"future_scaled": future_scaled,
	"history_length": preprocessed["history_length"],
	"future_length": preprocessed["future_length"],
	}

	def _quantile_loss(self, y_true: torch.Tensor, y_pred: torch.Tensor):
	"""
	Compute the quantile loss.
	y_true: [B, P, N]
	y_pred: [B, P, N, Q]
	"""
	# Add a dimension to y_true to match y_pred: [B, P, N] -> [B, P, N, 1]
	y_true = y_true.unsqueeze(-1)

	# Calculate errors
	errors = y_true - y_pred

	# Calculate quantile loss
	# The max operator implements the two cases of the quantile loss formula
	loss = torch.max((self.qt - 1) * errors, self.qt * errors)

	# Average the loss across all dimensions
	return loss.mean()

	def compute_loss(self, y_true: torch.Tensor, y_pred: dict):
	"""Compute loss between predictions and scaled ground truth."""
	predictions = y_pred["result"]
	scale_statistics = y_pred["scale_statistics"]

	if y_true is None:
	return torch.tensor(0.0, device=predictions.device)

	future_scaled = self.scaler.scale(y_true, scale_statistics)

	if self.loss_type == "huber":
	if predictions.shape != future_scaled.shape:
	raise ValueError(
	f"Shape mismatch for Huber loss: predictions {predictions.shape} "
	f"vs future_scaled {future_scaled.shape}"
	)
	return nn.functional.huber_loss(predictions, future_scaled)
	elif self.loss_type == "quantile":
	return self._quantile_loss(future_scaled, predictions)
	else:
	raise ValueError(f"Unknown loss type: {self.loss_type}")