import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple
import json


class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization (same as MAP-NEO)"""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        norm = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return x * norm * self.weight


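# RMSNorm in one line: y = x / sqrt(mean_i(x_i^2) + eps) * weight, i.e. LayerNorm
# without mean-centering and without a bias term.

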
class RotaryPositionalEmbedding(nn.Module):
    """Rotary Position Embedding (RoPE) - same as MAP-NEO"""

    def __init__(self, dim: int, max_len: int = 8192, theta: float = 10000.0):
        super().__init__()
        self.dim = dim
        self.max_len = max_len
        self.theta = theta

        # Inverse frequencies: freqs[i] = theta ** (-2i / dim); pair i of the head
        # dimension is rotated by angle position * freqs[i].
        freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("freqs", freqs, persistent=False)

    def forward(self, x: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]:
        # x is only used to pick the device; cos/sin have shape (seq_len, dim // 2).
        device = x.device
        positions = torch.arange(seq_len, device=device).float()
        angles = positions.unsqueeze(1) * self.freqs.unsqueeze(0)
        cos = torch.cos(angles)
        sin = torch.sin(angles)
        return cos, sin


def apply_rotary_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    """Apply rotary embedding to query/key tensors of shape (batch, seq, heads, head_dim)."""
    # Split the head dimension into even/odd pairs and rotate each pair by its angle.
    x1, x2 = x[..., ::2], x[..., 1::2]
    # cos/sin: (seq, head_dim // 2) -> broadcast over batch and heads.
    cos = cos.unsqueeze(0).unsqueeze(-2)
    sin = sin.unsqueeze(0).unsqueeze(-2)
    # The rotated pairs are concatenated rather than re-interleaved; since queries and
    # keys get the same permutation of the head dimension, attention scores are unchanged.
    rotated = torch.cat([
        x1 * cos - x2 * sin,
        x1 * sin + x2 * cos,
    ], dim=-1)
    return rotated


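# Illustrative shape check (hypothetical sizes):
#     rope = RotaryPositionalEmbedding(dim=64)
#     cos, sin = rope(torch.empty(1), seq_len=128)     # each of shape (128, 32)
#     q = torch.randn(2, 128, 8, 64)                   # (batch, seq, heads, head_dim)
#     apply_rotary_emb(q, cos, sin).shape              # torch.Size([2, 128, 8, 64])

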
class MultiHeadAttention(nn.Module):
    """Multi-head attention with RoPE and optional Flash Attention"""

    def __init__(self, dim: int, n_heads: int, dropout: float = 0.0):
        super().__init__()
        assert dim % n_heads == 0

        self.dim = dim
        self.n_heads = n_heads
        self.head_dim = dim // n_heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(dim, dim, bias=False)
        self.k_proj = nn.Linear(dim, dim, bias=False)
        self.v_proj = nn.Linear(dim, dim, bias=False)
        self.o_proj = nn.Linear(dim, dim, bias=False)

        self.dropout = nn.Dropout(dropout)

        self.rotary_emb = RotaryPositionalEmbedding(self.head_dim)

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        batch_size, seq_len, dim = x.shape

        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # (batch, seq, dim) -> (batch, heads, seq, head_dim)
        q = q.view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

        # apply_rotary_emb expects (batch, seq, heads, head_dim), so transpose around it.
        cos, sin = self.rotary_emb(q, seq_len)
        q = apply_rotary_emb(q.transpose(1, 2), cos, sin).transpose(1, 2)
        k = apply_rotary_emb(k.transpose(1, 2), cos, sin).transpose(1, 2)

        if attention_mask is None and hasattr(F, "scaled_dot_product_attention"):
            # Fast path: fused attention (Flash / memory-efficient kernels) with an
            # implicit causal mask. Only taken when there is no padding mask; padded
            # batches go through the explicit path below so the mask is respected.
            out = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,
                dropout_p=self.dropout.p if self.training else 0.0,
                is_causal=True,
            )
        else:
            # Fallback: explicit attention with a causal mask plus optional padding mask.
            scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale

            causal_mask = torch.triu(
                torch.ones(seq_len, seq_len, device=x.device, dtype=torch.bool), diagonal=1
            )
            scores = scores.masked_fill(causal_mask, float('-inf'))

            if attention_mask is not None:
                # attention_mask: (batch, seq_len), True/1 for tokens to attend to.
                scores = scores.masked_fill(
                    ~attention_mask.bool().unsqueeze(1).unsqueeze(1), float('-inf')
                )

            attn_weights = F.softmax(scores, dim=-1)
            attn_weights = self.dropout(attn_weights)
            out = torch.matmul(attn_weights, v)

        # (batch, heads, seq, head_dim) -> (batch, seq, dim)
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, dim)
        out = self.o_proj(out)

        return out


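# Illustrative standalone check (hypothetical sizes):
#     attn = MultiHeadAttention(dim=256, n_heads=8)
#     attn(torch.randn(2, 16, 256)).shape              # torch.Size([2, 16, 256])

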
class FeedForward(nn.Module):
    """SwiGLU Feed-Forward Network (same as MAP-NEO)"""

    def __init__(self, dim: int, hidden_dim: int, dropout: float = 0.0):
        super().__init__()
        self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)
        self.up_proj = nn.Linear(dim, hidden_dim, bias=False)
        self.down_proj = nn.Linear(hidden_dim, dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate = F.silu(self.gate_proj(x))
        up = self.up_proj(x)
        hidden = gate * up
        hidden = self.dropout(hidden)
        return self.down_proj(hidden)


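# SwiGLU in one line: FFN(x) = down_proj( SiLU(gate_proj(x)) * up_proj(x) ), which is
# why this block carries three projection matrices instead of the usual two.

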
class TransformerBlock(nn.Module):
    """Transformer block with pre-norm (RMSNorm)"""

    def __init__(self, dim: int, n_heads: int, hidden_dim: int, dropout: float = 0.0):
        super().__init__()
        self.attention_norm = RMSNorm(dim)
        self.attention = MultiHeadAttention(dim, n_heads, dropout)
        self.ffn_norm = RMSNorm(dim)
        self.ffn = FeedForward(dim, hidden_dim, dropout)

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Pre-norm residuals: normalize, transform, then add back the input.
        h = x + self.attention(self.attention_norm(x), attention_mask)
        h = h + self.ffn(self.ffn_norm(h))
        return h


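# Illustrative check (hypothetical sizes): a block is shape-preserving, so it can be
# stacked to any depth.
#     block = TransformerBlock(dim=256, n_heads=8, hidden_dim=512)
#     block(torch.randn(2, 16, 256)).shape             # torch.Size([2, 16, 256])

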
class NeoMiniConfig:
    """Configuration for MAP-NEO Mini (300M parameters)"""

    def __init__(self):
        self.vocab_size = 50257
        self.max_seq_len = 2048
        self.dim = 1024
        self.n_layers = 16
        self.n_heads = 16
        self.hidden_dim = 2736
        self.dropout = 0.0

    def to_dict(self):
        return {k: v for k, v in self.__dict__.items() if not k.startswith('_')}

    @classmethod
    def from_dict(cls, config_dict):
        config = cls()
        for k, v in config_dict.items():
            setattr(config, k, v)
        return config


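# Rough parameter budget for the defaults above (back-of-envelope, counting the tied
# LM head and token embedding once):
#     embedding : 50257 * 1024                                   ~ 51.5M
#     per layer : 4 * 1024^2 (attention) + 3 * 1024 * 2736 (FFN) ~ 12.6M
#     total     : 51.5M + 16 * 12.6M                             ~ 253M
# (~304M if the LM head were untied from the token embedding.)

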
class NeoMini(nn.Module):
    """MAP-NEO Mini Language Model (300M parameters)"""

    def __init__(self, config: NeoMiniConfig):
        super().__init__()
        self.config = config

        self.token_embedding = nn.Embedding(config.vocab_size, config.dim)

        self.blocks = nn.ModuleList([
            TransformerBlock(
                dim=config.dim,
                n_heads=config.n_heads,
                hidden_dim=config.hidden_dim,
                dropout=config.dropout,
            )
            for _ in range(config.n_layers)
        ])

        self.ln_f = RMSNorm(config.dim)
        self.lm_head = nn.Linear(config.dim, config.vocab_size, bias=False)

        # Tie the LM head to the token embedding (shared weight matrix).
        self.lm_head.weight = self.token_embedding.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # (batch, seq) token ids -> (batch, seq, vocab_size) logits.
        x = self.token_embedding(input_ids)

        for block in self.blocks:
            x = block(x, attention_mask)

        x = self.ln_f(x)
        logits = self.lm_head(x)

        return logits

    def get_num_params(self):
        """Count model parameters"""
        return sum(p.numel() for p in self.parameters())

    def save_config(self, path: str):
        """Save model configuration"""
        with open(path, 'w') as f:
            json.dump(self.config.to_dict(), f, indent=2)

    @classmethod
    def from_config(cls, config_path: str):
        """Load model from configuration"""
        with open(config_path, 'r') as f:
            config_dict = json.load(f)
        config = NeoMiniConfig.from_dict(config_dict)
        return cls(config)


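# Config round-trip sketch (hypothetical path; yields a fresh, randomly initialized model):
#     model = NeoMini(NeoMiniConfig())
#     model.save_config("neo_mini_config.json")
#     restored = NeoMini.from_config("neo_mini_config.json")

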
def create_model():
    """Create a MAP-NEO Mini model"""
    config = NeoMiniConfig()
    model = NeoMini(config)

    print(f"Created MAP-NEO Mini with {model.get_num_params():,} parameters")
    print(f"Config: {config.n_layers} layers, {config.dim} dim, {config.n_heads} heads")

    return model, config


if __name__ == "__main__":
    model, config = create_model()

    # Smoke test: run a random batch through the model.
    batch_size, seq_len = 2, 512
    input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_len))

    with torch.no_grad():
        logits = model(input_ids)

    print(f"Input shape: {input_ids.shape}")
    print(f"Output shape: {logits.shape}")
    print("Model test passed!")