import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
torch.empty(1, device='cuda', requires_grad=True).backward() # prevents a bug on some systems
from torch import Tensor, nn
import torch.nn.functional as F
import torch.distributed as dist
# use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention
torch._inductor.config.coordinate_descent_tuning = True

# -----------------------------------------------------------------------------
# Custom operators

@torch.library.custom_op("nanogpt::mm", mutates_args=())
def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
    @torch.compile
    def impl(x: Tensor, w: Tensor):
        assert x.is_contiguous() and w.is_contiguous()
        x_f8 = x.mul(x_s).to(torch.float8_e4m3fn)
        w_f8 = w.mul(w_s).to(torch.float8_e4m3fn)
        out = torch._scaled_mm(
            x_f8,
            w_f8.t(),
            out_dtype=torch.bfloat16,
            scale_a=x.new_tensor(1 / x_s, dtype=torch.float32),
            scale_b=x.new_tensor(1 / w_s, dtype=torch.float32),
            use_fast_accum=True,
        )
        return out, x_f8, w_f8
    return impl(x, w)
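
# The fake (meta) implementation below tells torch.compile only the shapes, dtypes,
# and devices this op produces; its return values are never used in real computation.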
@mm_op.register_fake
def _(x: Tensor, w: Tensor, *_):
    assert x.ndim == w.ndim == 2
    assert x.shape[1] == w.shape[1]
    assert x.device == w.device
    assert x.is_contiguous() and w.is_contiguous()
    return x @ w.t(), x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn)

@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]:
    @torch.compile
    def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor):
        assert grad.is_contiguous()
        x_inv_s = grad.new_tensor(1 / x_s, dtype=torch.float32)
        w_inv_s = grad.new_tensor(1 / w_s, dtype=torch.float32)
        grad_inv_s = grad.new_tensor(1 / grad_s, dtype=torch.float32)
        grad_f8 = grad.mul(grad_s).to(torch.float8_e5m2)
        grad_x = torch._scaled_mm(
            grad_f8,
            w_f8.t().contiguous().t(),
            out_dtype=torch.bfloat16,
            scale_a=grad_inv_s,
            scale_b=w_inv_s,
            use_fast_accum=False,
        )
        # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768)
        grad_w = torch._scaled_mm(
            x_f8.t().contiguous(),
            grad_f8.t().contiguous().t(),
            out_dtype=torch.float32,
            scale_a=x_inv_s,
            scale_b=grad_inv_s,
            use_fast_accum=False,
        ).t()
        return grad_x, grad_w
    return impl(g, x_f8, w_f8)

@mm_backward_op.register_fake
def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_):
    return x_f8.to(torch.bfloat16), w_f8.to(torch.float32)

def backward(ctx, grad_out: Tensor, *_):
    x_f8, w_f8 = ctx.saved_tensors
    x_s, w_s, grad_s = ctx.scales
    grad_x, grad_w = torch.ops.nanogpt.mm_backward(
        grad_out, x_f8, w_f8, x_s, w_s, grad_s
    )
    return grad_x, grad_w, None, None, None

def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
    *_, x_s, w_s, grad_s = inputs
    _, x_f8, w_f8 = output
    ctx.save_for_backward(x_f8, w_f8)
    ctx.scales = x_s, w_s, grad_s
    ctx.set_materialize_grads(False)

mm_op.register_autograd(backward, setup_context=setup_context)

def lm_head_fp8(x: Tensor, w: Tensor) -> Tensor:
    _x = x.flatten(0, -2)
    out: Tensor = torch.ops.nanogpt.mm(_x, w, x_s=2.0, w_s=32.0, grad_s=2.0**29)[0]
    return out.reshape(*x.shape[:-1], -1)
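
# Note on the constants above: x_s/w_s/grad_s pre-scale tensors into float8's usable
# range (e4m3 forward, e5m2 for gradients), and _scaled_mm divides them back out via
# scale_a/scale_b. The specific values (2.0, 32.0, 2.0**29) are empirical choices for
# this model, not a universal fp8 recipe.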

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor:
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    if G.size(0) > G.size(1):
        X = X.T
    # Ensure spectral norm is at most 1
    X = X / (X.norm() + 1e-7)
    # Perform the NS iterations
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X
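
# A minimal sanity check of the claim above (defined but never called; assumes a CUDA
# device): the singular values of the output should land roughly in the (0.5, 1.5) band.
def _demo_newtonschulz_spectrum():
    G = torch.randn(768, 3072, device="cuda")
    X = zeropower_via_newtonschulz5(G, steps=5)
    S = torch.linalg.svdvals(X.float())
    print(f"NS output singular values: min={S.min():.3f} max={S.max():.3f}")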

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-Schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
      parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5, rank=0, world_size=1):
        self.rank = rank
        self.world_size = world_size
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params: list[Tensor] = [*params]
        assert all(isinstance(p, Tensor) for p in params)
        sizes = {p.numel() for p in params}
        def create_update_buffer(size: int):
            b = torch.empty(self.world_size, size, dtype=torch.bfloat16, device="cuda")
            return dict(update_buffer=b, update_buffer_views=[b[i] for i in range(self.world_size)])
        param_groups = [
            dict(params=[p for p in params if p.numel() == size], **create_update_buffer(size)) for size in sizes]
        super().__init__(param_groups, defaults)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffer = group['update_buffer']
            update_buffer_views: list[Tensor] = group['update_buffer_views']
            # generate weight updates in distributed fashion
            params: list[Tensor] = group['params']
            handle = None
            params_world = None
            def update_prev():
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffer_views):
                    p_world.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                if base_i + self.rank < len(params):
                    p = params[base_i + self.rank]
                    g = p.grad
                    assert g is not None
                    state = self.state[p]
                    if 'momentum_buffer' not in state:
                        state['momentum_buffer'] = torch.zeros_like(g)
                    buf: Tensor = state['momentum_buffer']
                    buf.lerp_(g, 1 - momentum)
                    g = g.lerp_(buf, momentum) if nesterov else buf
                    g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                else:
                    g = update_buffer_views[self.rank]
                update_prev() # async all_gather instead of sync all_reduce by @YouJiacheng
                handle = dist.all_gather_into_tensor(update_buffer, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()
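
# Hypothetical usage sketch (defined but never called): note that step() issues an
# all_gather, so torch.distributed must be initialized even when world_size == 1.
def _demo_muon_single_gpu():
    net = nn.Linear(512, 512, bias=False).cuda()
    opt = Muon(net.parameters(), lr=0.02, momentum=0.95, rank=0, world_size=1)
    net.weight.grad = torch.randn_like(net.weight)
    opt.step()  # orthogonalized momentum update, scaled by sqrt(max(1, rows/cols))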

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model

def norm(x):
    return F.rms_norm(x, (x.size(-1),))
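# (RMS normalization over the last dimension, with no learnable scale)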

class CastedLinear(nn.Linear):
    def __init__(self, in_features: int, out_features: int):
        super().__init__(in_features, out_features, bias=False)

    def reset_parameters(self) -> None:
        std = 0.5 * (self.in_features ** -0.5) # 0.5 is a bit better than the default 1/sqrt(3)
        bound = (3 ** 0.5) * std
        with torch.no_grad():
            self.weight.uniform_(-bound, bound)

    def forward(self, x):
        return F.linear(x, self.weight.type_as(x))

class Rotary(nn.Module):
    def __init__(self, dim: int, max_seq_len=65536):
        super().__init__()
        # half-truncate RoPE by @YouJiacheng (w/ base freq tuning)
        angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=dim//4, dtype=torch.float32)
        angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(dim//4)])
        t = torch.arange(max_seq_len, dtype=torch.float32)
        theta = torch.einsum('i,j -> ij', t, angular_freq)
        self.cos = nn.Buffer(theta.cos(), persistent=False)
        self.sin = nn.Buffer(theta.sin(), persistent=False)

    def forward(self, x_BTHD: Tensor):
        assert self.cos.size(0) >= x_BTHD.size(-3)
        cos, sin = self.cos[None, :x_BTHD.size(-3), None, :], self.sin[None, :x_BTHD.size(-3), None, :]
        x1, x2 = x_BTHD.to(dtype=torch.float32).chunk(2, dim=-1)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x_BTHD)
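
# Illustrative check (defined but never called): rotary embeddings make the dot product
# <q_t, k_s> depend only on the offset t - s. Feeding the same vector at every position,
# two pairs with equal offset should match:
def _demo_rotary_shift_invariance():
    rot = Rotary(dim=64)
    x = torch.randn(1, 1, 1, 64).expand(1, 8, 1, 64)  # one vector, repeated at 8 positions
    q, k = rot(x), rot(x)
    d42 = (q[0, 4, 0] * k[0, 2, 0]).sum()  # positions (4, 2), offset 2
    d64 = (q[0, 6, 0] * k[0, 4, 0]).sum()  # positions (6, 4), offset 2
    print(torch.allclose(d42, d64, atol=1e-3))  # True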

class CausalSelfAttention(nn.Module):
    def __init__(self, dim: int, num_heads: int):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977

    def forward(self, x: Tensor, ve: Tensor | None, block_mask: BlockMask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, 'Must use batch size = 1 for FlexAttention'
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        if ve is not None:
            v = self.lambdas[0] * v + self.lambdas[1] * ve.view_as(v) # @KoszarskyB & @Grad62304977
        else: # skip mid-layers token value embeddings by @YouJiacheng
            v = self.lambdas[0] * v
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    def __init__(self, model_dim: int, num_heads: int, layer_idx: int):
        super().__init__()
        # skip attention of blocks.7 (the 8th layer) by @YouJiacheng
        self.attn = CausalSelfAttention(model_dim, num_heads) if layer_idx != 7 else None
        self.mlp = MLP(model_dim)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, ve, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        if self.attn is not None:
            x = x + self.attn(norm(x), ve, block_mask)
        x = x + self.mlp(norm(x))
        return x

class ValueEmbedding(nn.Module):
    def __init__(self, num_embeddings: int, embedding_dim: int):
        super().__init__()
        self.embed = nn.ModuleList([nn.Embedding(num_embeddings, embedding_dim) for _ in range(3)])

    def forward(self, input_seq) -> list[Tensor | None]:
        ve = [emb(input_seq) for emb in self.embed]
        # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure
        ve = [ve[0], ve[1], ve[2], None, None, None, None, None, None, ve[0], ve[1], ve[2]]
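        # i.e. layers 0-2 and 9-11 receive value embeddings (reusing the same three tables); the middle layers get None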
        return ve

# -----------------------------------------------------------------------------
# The main GPT-2 model

def next_multiple_of_n(v: float | int, *, n: int):
    return next(x for x in range(n, int(v) + 1 + n, n) if x >= v)
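# e.g. next_multiple_of_n(50257, n=128) == 50304, the padded vocab size used below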

class GPT(nn.Module):
    def __init__(self, vocab_size: int, num_layers: int, num_heads: int, model_dim: int):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, model_dim)
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        self.value_embeds = ValueEmbedding(vocab_size, model_dim)
        self.blocks = nn.ModuleList([Block(model_dim, num_heads, layer_idx) for layer_idx in range(num_layers)])
        # U-net design by @brendanh0gan
        self.num_encoder_layers = num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))
        # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency.
        # suggested to me by @Grad62304977. this originates from Karpathy's experiments.
        self.lm_head = CastedLinear(model_dim, next_multiple_of_n(vocab_size, n=128))
        self.lm_head.weight.detach().zero_() # @Grad62304977

    def forward(self, input_seq: Tensor, target_seq: Tensor, sliding_window_num_blocks: Tensor):
        BLOCK_SIZE = 128
        assert input_seq.ndim == 1
        assert len(input_seq) % BLOCK_SIZE == 0
        NUM_BLOCKS = len(input_seq) // BLOCK_SIZE
        docs = (input_seq == 50256).cumsum(0)
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        def document_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_mask: Tensor):
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        # manual block mask creation by @YouJiacheng
        def create_doc_swc_block_mask(sliding_window_num_blocks: Tensor):
            kv_idx = block_idx = torch.arange(NUM_BLOCKS, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm # block-wise sliding window by @YouJiacheng
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm & ~full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)

        x = x0 = norm(self.embed(input_seq)[None]) # use of norm here by @Grad62304977
        ve = self.value_embeds(input_seq)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]
        assert len(ve_enc) == self.num_encoder_layers and len(ve_dec) == self.num_decoder_layers

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)

        x = norm(x)
        logits = lm_head_fp8(x, self.lm_head.weight) if self.training else self.lm_head(x)
        # @Grad62304977 added tanh softcapping, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1)
        logits = 30 * torch.sigmoid(logits.float() / 7.5)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_seq)
        return loss
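
# Not called during training: the sigmoid above is the shifted tanh softcap from the
# comment, since 2*sigmoid(2z) = tanh(z) + 1 implies 30*sigmoid(x/7.5) = 15*tanh(x/15) + 15.
def _demo_softcap_identity():
    x = torch.linspace(-100, 100, steps=11)
    print(torch.allclose(30 * torch.sigmoid(x / 7.5), 15 * torch.tanh(x / 15) + 15, atol=1e-5))  # True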

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

def _load_data_shard(file: Path):
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32) # header is 256 int32
    assert header[0] == 20240520, 'magic number mismatch in the data .bin file'
    assert header[1] == 1, 'unsupported version'
    num_tokens = int(header[2]) # number of tokens (claimed)
    with file.open('rb', buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng
        assert nbytes == 2 * num_tokens, 'number of tokens read does not match header'
    return tokens
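
# Hypothetical helper (not part of the training run): writes a .bin shard in the exact
# layout _load_data_shard expects -- 256 int32s of header, then uint16 token ids.
def _demo_write_shard(path: Path, tokens: Tensor):
    header = torch.zeros(256, dtype=torch.int32)
    header[0] = 20240520      # magic number
    header[1] = 1             # version
    header[2] = len(tokens)   # token count
    with path.open("wb") as f:
        f.write(header.numpy().tobytes())
        f.write(tokens.to(torch.uint16).numpy().tobytes())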

def distributed_data_generator(filename_pattern: str, batch_size: int, rank: int, world_size: int):
    files = sorted(Path.cwd().glob(filename_pattern))
    assert batch_size % world_size == 0
    local_batch_size = batch_size // world_size
    file_iter = iter(files) # use cycle(files) if you want to do multi-epoch training
    tokens, pos = _load_data_shard(next(file_iter)), 0
    while True:
        if pos + batch_size + 1 >= len(tokens):
            tokens, pos = _load_data_shard(next(file_iter)), 0
        buf = tokens[pos + rank * local_batch_size:][:local_batch_size + 1]
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # no sync on host side;
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # H2D in another stream isn't helpful.
        pos += batch_size
        yield inputs, targets

# -----------------------------------------------------------------------------
# int main

@dataclass
class Hyperparameters:
    # data
    train_files = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    val_files = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    val_tokens = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    # optimization
    batch_size = 8*64*1024 # batch size in tokens
    num_iterations = 1395 # number of iterations to run
    cooldown_frac = 0.4 # fraction of training spent cooling down the learning rate
    # evaluation and logging
    val_loss_every = 125 # every how many steps to evaluate val loss? 0 for only at the end
    # implementation
    seq_len = 64*1024 # FlexAttention sequence length
    save_checkpoint = False
args = Hyperparameters()

# torchrun sets these env variables
rank = int(os.environ['RANK'])
world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
device = torch.device('cuda', int(os.environ['LOCAL_RANK']))
torch.cuda.set_device(device)
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (rank == 0) # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    os.makedirs('logs', exist_ok=True)
    logfile = f'logs/{run_id}.txt'
    print(logfile)
def print0(s, console=False):
    if master_process:
        with open(logfile, 'a') as f:
            if console:
                print(s)
            print(s, file=f)

# begin by printing this file (the Python code)
print0(code)
print0('='*100)
# log information about the hardware/software environment this is running on
print0(f'Running Python {sys.version}')
print0(f'Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}')
def nvidia_smi():
    import subprocess # avoid top level import
    return subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout
print0(nvidia_smi())
print0('='*100)

# load data
train_loader = distributed_data_generator(args.train_files, args.batch_size, rank, world_size)

model = GPT(vocab_size=50257, num_layers=12, num_heads=6, model_dim=768).cuda()
for m in model.modules():
    if isinstance(m, nn.Embedding):
        m.bfloat16()
for param in model.parameters():
    dist.broadcast(param.detach(), 0)

# collect the parameters to optimize
hidden_matrix_params = [p for p in model.blocks.parameters() if p.ndim == 2]
embed_params = [model.embed.weight, *model.value_embeds.parameters()]
scalar_params = [p for p in model.parameters() if p.ndim < 2]
head_params = [model.lm_head.weight]

# init the optimizer(s)
adam_params = [dict(params=head_params, lr=0.008), dict(params=embed_params, lr=0.6), dict(params=scalar_params, lr=0.04)]
optimizer1 = torch.optim.Adam(adam_params, betas=(0.8, 0.95), fused=True)
optimizer2 = Muon(hidden_matrix_params, lr=0.05, momentum=0.95, rank=rank, world_size=world_size)
optimizers = [optimizer1, optimizer2]

# learning rate schedule: stable then decay
def get_lr(it: int):
    t = 1 - it / args.num_iterations # time remaining in training
    assert 1 >= t >= 0
    w = min(t / args.cooldown_frac, 1.0) # 1 -> 0
    return w * 1.0 + (1 - w) * 0.1
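# i.e. the multiplier holds at 1.0 for the first 60% of training (cooldown_frac = 0.4),
# then decays linearly to 0.1: get_lr(0) == 1.0 and get_lr(args.num_iterations) == 0.1.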
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

@lru_cache(1)
def sw_num_blks(window_size: int):
    return torch.tensor(window_size // 128, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
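# (the pinned host tensor + non_blocking copy avoids a host sync, and lru_cache(1)
# reuses the device tensor across repeated calls with the same window size)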

model: nn.Module = torch.compile(model)
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
train_steps = args.num_iterations
for step in range(train_steps + 1):
    last_step = (step == train_steps)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Linearly increase the block-wise sliding window size over training 128 -> 1792:
    # increase by @fernbear.bsky.social; block-wise by @YouJiacheng
    window_size = next_multiple_of_n(1728 * step / train_steps, n=128)

    # --------------- VALIDATION SECTION -----------------
    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        model.eval()
        val_bs = world_size * args.seq_len
        assert args.val_tokens % val_bs == 0
        val_steps = args.val_tokens // val_bs
        val_loader = distributed_data_generator(args.val_files, val_bs, rank, world_size)
        val_loss = 0
        with torch.no_grad():
            for _ in range(val_steps):
                x, y = next(val_loader)
                val_loss += model(x, y, sw_num_blks(window_size))
        val_loss /= val_steps
        del val_loader
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        print0(f'step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms', console=True)
        model.train()
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if last_step:
        if master_process and args.save_checkpoint:
            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
            os.makedirs(f'logs/{run_id}', exist_ok=True)
            torch.save(log, f'logs/{run_id}/state_step{step:06d}.pt')
        # the last step only has the validation loop, so break to avoid training
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    inputs, targets = next(train_loader)
    for input_seq, target_seq in zip(inputs.split(args.seq_len), targets.split(args.seq_len)):
        model(input_seq, target_seq, sw_num_blks(window_size)).backward()
    for param in model.parameters():
        dist.all_reduce(param.grad, op=dist.ReduceOp.AVG)
    # momentum warmup for Muon
    frac = min(step / 300, 1)
    for group in optimizer2.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
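    # (ramps Muon's momentum linearly from 0.85 at step 0 to 0.95 by step 300)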
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # logging
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f'step:{step+1}/{train_steps} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms', console=True)

print0(
    f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
    f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB"
)
dist.destroy_process_group()
====================================================================================================
Running Python 3.12.7 (main, Jan 16 2025, 08:58:39) [GCC 13.2.0]
Running PyTorch 2.7.0.dev20250110+cu126 compiled for CUDA 12.6
Thu Jan 16 10:26:28 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.05             Driver Version: 550.127.05     CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA H100 80GB HBM3          On  |   00000000:61:00.0 Off |                    0 |
| N/A   31C    P0            133W /  700W |    7746MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H100 80GB HBM3          On  |   00000000:62:00.0 Off |                    0 |
| N/A   32C    P0            125W /  700W |    3456MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA H100 80GB HBM3          On  |   00000000:63:00.0 Off |                    0 |
| N/A   34C    P0            127W /  700W |    3456MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA H100 80GB HBM3          On  |   00000000:64:00.0 Off |                    0 |
| N/A   30C    P0            116W /  700W |    3456MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   4  NVIDIA H100 80GB HBM3          On  |   00000000:6A:00.0 Off |                    0 |
| N/A   30C    P0            113W /  700W |    3456MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   5  NVIDIA H100 80GB HBM3          On  |   00000000:6B:00.0 Off |                    0 |
| N/A   35C    P0            124W /  700W |    3456MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   6  NVIDIA H100 80GB HBM3          On  |   00000000:6C:00.0 Off |                    0 |
| N/A   35C    P0            126W /  700W |    3456MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   7  NVIDIA H100 80GB HBM3          On  |   00000000:6D:00.0 Off |                    0 |
| N/A   30C    P0            126W /  700W |    3216MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
+-----------------------------------------------------------------------------------------+
====================================================================================================
step:0/1395 val_loss:10.8258 train_time:0ms step_avg:nanms
step:1/1395 train_time:26881ms step_avg:nanms
step:2/1395 train_time:27284ms step_avg:nanms
step:3/1395 train_time:27403ms step_avg:nanms
step:4/1395 train_time:27526ms step_avg:nanms
step:5/1395 train_time:27650ms step_avg:nanms
step:6/1395 train_time:27773ms step_avg:nanms
step:7/1395 train_time:27896ms step_avg:nanms
step:8/1395 train_time:28018ms step_avg:nanms
step:9/1395 train_time:28142ms step_avg:nanms
step:10/1395 train_time:28267ms step_avg:nanms
step:11/1395 train_time:123ms step_avg:nanms
step:12/1395 train_time:247ms step_avg:nanms
step:13/1395 train_time:371ms step_avg:123.56ms
step:14/1395 train_time:495ms step_avg:123.70ms
step:15/1395 train_time:619ms step_avg:123.78ms
step:16/1395 train_time:742ms step_avg:123.65ms
step:17/1395 train_time:866ms step_avg:123.74ms
step:18/1395 train_time:991ms step_avg:123.93ms
step:19/1395 train_time:1117ms step_avg:124.10ms
step:20/1395 train_time:1241ms step_avg:124.07ms
step:21/1395 train_time:1365ms step_avg:124.09ms
step:22/1395 train_time:1490ms step_avg:124.14ms
step:23/1395 train_time:1614ms step_avg:124.17ms
step:24/1395 train_time:1737ms step_avg:124.09ms
step:25/1395 train_time:1860ms step_avg:124.03ms
step:26/1395 train_time:1985ms step_avg:124.09ms
step:27/1395 train_time:2109ms step_avg:124.05ms
step:28/1395 train_time:2233ms step_avg:124.04ms
step:29/1395 train_time:2357ms step_avg:124.03ms
step:30/1395 train_time:2480ms step_avg:124.02ms
step:31/1395 train_time:2603ms step_avg:123.97ms
step:32/1395 train_time:2727ms step_avg:123.95ms
step:33/1395 train_time:2850ms step_avg:123.93ms
step:34/1395 train_time:2975ms step_avg:123.98ms
step:35/1395 train_time:3100ms step_avg:123.98ms
step:36/1395 train_time:3224ms step_avg:123.99ms
step:37/1395 train_time:3349ms step_avg:124.04ms
step:38/1395 train_time:3473ms step_avg:124.03ms
step:39/1395 train_time:3598ms step_avg:124.07ms
step:40/1395 train_time:3723ms step_avg:124.09ms
step:41/1395 train_time:3847ms step_avg:124.08ms
step:42/1395 train_time:3971ms step_avg:124.08ms
step:43/1395 train_time:4095ms step_avg:124.08ms
step:44/1395 train_time:4220ms step_avg:124.11ms
step:45/1395 train_time:4343ms step_avg:124.07ms
step:46/1395 train_time:4469ms step_avg:124.15ms
step:47/1395 train_time:4593ms step_avg:124.15ms
step:48/1395 train_time:4717ms step_avg:124.13ms
step:49/1395 train_time:4842ms step_avg:124.14ms
step:50/1395 train_time:4965ms step_avg:124.12ms
step:51/1395 train_time:5090ms step_avg:124.14ms
step:52/1395 train_time:5215ms step_avg:124.17ms
step:53/1395 train_time:5339ms step_avg:124.16ms
step:54/1395 train_time:5463ms step_avg:124.15ms
step:55/1395 train_time:5587ms step_avg:124.16ms
step:56/1395 train_time:5711ms step_avg:124.16ms
step:57/1395 train_time:5836ms step_avg:124.17ms
step:58/1395 train_time:5960ms step_avg:124.17ms
step:59/1395 train_time:6084ms step_avg:124.17ms
step:60/1395 train_time:6210ms step_avg:124.19ms
step:61/1395 train_time:6333ms step_avg:124.18ms
step:62/1395 train_time:6457ms step_avg:124.18ms
step:63/1395 train_time:6581ms step_avg:124.17ms
step:64/1395 train_time:6705ms step_avg:124.17ms
step:65/1395 train_time:6831ms step_avg:124.20ms
step:66/1395 train_time:6955ms step_avg:124.20ms
step:67/1395 train_time:7080ms step_avg:124.20ms
step:68/1395 train_time:7203ms step_avg:124.20ms
step:69/1395 train_time:7327ms step_avg:124.18ms
step:70/1395 train_time:7452ms step_avg:124.20ms
step:71/1395 train_time:7576ms step_avg:124.20ms
step:72/1395 train_time:7700ms step_avg:124.20ms
step:73/1395 train_time:7824ms step_avg:124.19ms
step:74/1395 train_time:7949ms step_avg:124.20ms
step:75/1395 train_time:8073ms step_avg:124.20ms
step:76/1395 train_time:8198ms step_avg:124.21ms
step:77/1395 train_time:8322ms step_avg:124.21ms
step:78/1395 train_time:8445ms step_avg:124.19ms
step:79/1395 train_time:8569ms step_avg:124.18ms
step:80/1395 train_time:8693ms step_avg:124.18ms
step:81/1395 train_time:8818ms step_avg:124.20ms
step:82/1395 train_time:8941ms step_avg:124.18ms
step:83/1395 train_time:9065ms step_avg:124.17ms
step:84/1395 train_time:9190ms step_avg:124.19ms
step:85/1395 train_time:9314ms step_avg:124.19ms
step:86/1395 train_time:9439ms step_avg:124.19ms
step:87/1395 train_time:9562ms step_avg:124.19ms
step:88/1395 train_time:9688ms step_avg:124.20ms
step:89/1395 train_time:9812ms step_avg:124.21ms
step:90/1395 train_time:9936ms step_avg:124.20ms
step:91/1395 train_time:10059ms step_avg:124.18ms
step:92/1395 train_time:10182ms step_avg:124.18ms
step:93/1395 train_time:10306ms step_avg:124.17ms
step:94/1395 train_time:10431ms step_avg:124.18ms
step:95/1395 train_time:10556ms step_avg:124.19ms
step:96/1395 train_time:10681ms step_avg:124.20ms
step:97/1395 train_time:10806ms step_avg:124.21ms
step:98/1395 train_time:10930ms step_avg:124.21ms
step:99/1395 train_time:11054ms step_avg:124.20ms
step:100/1395 train_time:11178ms step_avg:124.20ms
step:101/1395 train_time:11301ms step_avg:124.19ms
step:102/1395 train_time:11425ms step_avg:124.19ms
step:103/1395 train_time:11551ms step_avg:124.20ms
step:104/1395 train_time:11676ms step_avg:124.22ms
step:105/1395 train_time:11802ms step_avg:124.23ms
step:106/1395 train_time:11928ms step_avg:124.25ms
step:107/1395 train_time:12055ms step_avg:124.28ms
step:108/1395 train_time:12182ms step_avg:124.30ms
step:109/1395 train_time:12308ms step_avg:124.32ms
step:110/1395 train_time:12434ms step_avg:124.34ms
step:111/1395 train_time:12561ms step_avg:124.37ms
step:112/1395 train_time:12690ms step_avg:124.41ms
step:113/1395 train_time:12817ms step_avg:124.44ms
step:114/1395 train_time:12944ms step_avg:124.46ms
step:115/1395 train_time:13071ms step_avg:124.49ms
step:116/1395 train_time:13199ms step_avg:124.51ms
step:117/1395 train_time:13325ms step_avg:124.53ms
step:118/1395 train_time:13451ms step_avg:124.54ms
step:119/1395 train_time:13578ms step_avg:124.57ms
step:120/1395 train_time:13705ms step_avg:124.59ms
step:121/1395 train_time:13833ms step_avg:124.62ms
step:122/1395 train_time:13960ms step_avg:124.64ms
step:123/1395 train_time:14087ms step_avg:124.66ms
step:124/1395 train_time:14214ms step_avg:124.68ms
step:125/1395 train_time:14341ms step_avg:124.70ms
step:125/1395 val_loss:4.3745 train_time:14441ms step_avg:125.57ms
step:126/1395 train_time:14475ms step_avg:124.78ms
step:127/1395 train_time:14609ms step_avg:124.86ms
step:128/1395 train_time:14736ms step_avg:124.88ms
step:129/1395 train_time:14862ms step_avg:124.89ms
step:130/1395 train_time:14988ms step_avg:124.90ms
step:131/1395 train_time:15115ms step_avg:124.91ms
step:132/1395 train_time:15241ms step_avg:124.93ms
step:133/1395 train_time:15368ms step_avg:124.94ms
step:134/1395 train_time:15494ms step_avg:124.95ms
step:135/1395 train_time:15621ms step_avg:124.96ms
step:136/1395 train_time:15748ms step_avg:124.99ms
step:137/1395 train_time:15877ms step_avg:125.02ms
step:138/1395 train_time:16004ms step_avg:125.03ms
step:139/1395 train_time:16130ms step_avg:125.04ms
step:140/1395 train_time:16255ms step_avg:125.04ms
step:141/1395 train_time:16382ms step_avg:125.05ms
step:142/1395 train_time:16508ms step_avg:125.06ms
step:143/1395 train_time:16634ms step_avg:125.07ms
step:144/1395 train_time:16762ms step_avg:125.09ms
step:145/1395 train_time:16890ms step_avg:125.11ms
step:146/1395 train_time:17016ms step_avg:125.12ms
step:147/1395 train_time:17143ms step_avg:125.13ms
step:148/1395 train_time:17270ms step_avg:125.14ms
step:149/1395 train_time:17397ms step_avg:125.16ms
step:150/1395 train_time:17524ms step_avg:125.17ms
step:151/1395 train_time:17650ms step_avg:125.18ms
step:152/1395 train_time:17776ms step_avg:125.19ms
step:153/1395 train_time:17903ms step_avg:125.19ms
step:154/1395 train_time:18029ms step_avg:125.20ms
step:155/1395 train_time:18155ms step_avg:125.21ms
step:156/1395 train_time:18282ms step_avg:125.22ms
step:157/1395 train_time:18409ms step_avg:125.23ms
step:158/1395 train_time:18535ms step_avg:125.24ms
step:159/1395 train_time:18662ms step_avg:125.25ms
step:160/1395 train_time:18789ms step_avg:125.26ms
step:161/1395 train_time:18916ms step_avg:125.27ms
step:162/1395 train_time:19042ms step_avg:125.27ms
step:163/1395 train_time:19169ms step_avg:125.29ms
step:164/1395 train_time:19295ms step_avg:125.29ms
step:165/1395 train_time:19422ms step_avg:125.30ms
step:166/1395 train_time:19549ms step_avg:125.31ms
step:167/1395 train_time:19675ms step_avg:125.32ms
step:168/1395 train_time:19802ms step_avg:125.33ms
step:169/1395 train_time:19928ms step_avg:125.34ms
step:170/1395 train_time:20055ms step_avg:125.34ms
step:171/1395 train_time:20182ms step_avg:125.35ms
step:172/1395 train_time:20309ms step_avg:125.36ms
step:173/1395 train_time:20435ms step_avg:125.37ms
step:174/1395 train_time:20561ms step_avg:125.37ms
step:175/1395 train_time:20688ms step_avg:125.38ms
step:176/1395 train_time:20814ms step_avg:125.39ms
step:177/1395 train_time:20941ms step_avg:125.40ms
step:178/1395 train_time:21068ms step_avg:125.40ms
step:179/1395 train_time:21194ms step_avg:125.41ms
step:180/1395 train_time:21321ms step_avg:125.42ms
step:181/1395 train_time:21448ms step_avg:125.42ms
step:182/1395 train_time:21574ms step_avg:125.43ms
step:183/1395 train_time:21701ms step_avg:125.44ms
step:184/1395 train_time:21827ms step_avg:125.45ms
step:185/1395 train_time:21954ms step_avg:125.45ms
step:186/1395 train_time:22080ms step_avg:125.46ms
step:187/1395 train_time:22208ms step_avg:125.47ms
step:188/1395 train_time:22334ms step_avg:125.47ms
step:189/1395 train_time:22462ms step_avg:125.49ms
step:190/1395 train_time:22589ms step_avg:125.50ms
step:191/1395 train_time:22716ms step_avg:125.50ms
step:192/1395 train_time:22843ms step_avg:125.51ms
step:193/1395 train_time:22971ms step_avg:125.52ms
step:194/1395 train_time:23097ms step_avg:125.53ms
step:195/1395 train_time:23223ms step_avg:125.53ms
step:196/1395 train_time:23349ms step_avg:125.53ms
step:197/1395 train_time:23475ms step_avg:125.53ms
step:198/1395 train_time:23602ms step_avg:125.54ms
step:199/1395 train_time:23728ms step_avg:125.55ms
step:200/1395 train_time:23855ms step_avg:125.55ms
step:201/1395 train_time:23983ms step_avg:125.57ms
step:202/1395 train_time:24111ms step_avg:125.58ms
step:203/1395 train_time:24238ms step_avg:125.59ms
step:204/1395 train_time:24365ms step_avg:125.59ms
step:205/1395 train_time:24492ms step_avg:125.60ms
step:206/1395 train_time:24617ms step_avg:125.60ms
step:207/1395 train_time:24744ms step_avg:125.60ms
step:208/1395 train_time:24871ms step_avg:125.61ms
step:209/1395 train_time:25000ms step_avg:125.63ms
step:210/1395 train_time:25129ms step_avg:125.64ms
step:211/1395 train_time:25257ms step_avg:125.66ms
step:212/1395 train_time:25387ms step_avg:125.68ms
step:213/1395 train_time:25516ms step_avg:125.70ms
step:214/1395 train_time:25645ms step_avg:125.71ms
step:215/1395 train_time:25775ms step_avg:125.73ms
step:216/1395 train_time:25904ms step_avg:125.75ms
step:217/1395 train_time:26032ms step_avg:125.76ms
step:218/1395 train_time:26159ms step_avg:125.76ms
step:219/1395 train_time:26289ms step_avg:125.79ms
step:220/1395 train_time:26418ms step_avg:125.80ms
step:221/1395 train_time:26548ms step_avg:125.82ms
step:222/1395 train_time:26678ms step_avg:125.84ms
step:223/1395 train_time:26807ms step_avg:125.86ms
step:224/1395 train_time:26937ms step_avg:125.87ms
step:225/1395 train_time:27066ms step_avg:125.89ms
step:226/1395 train_time:27196ms step_avg:125.91ms
step:227/1395 train_time:27325ms step_avg:125.92ms
step:228/1395 train_time:27455ms step_avg:125.94ms
step:229/1395 train_time:27584ms step_avg:125.95ms
step:230/1395 train_time:27714ms step_avg:125.97ms
step:231/1395 train_time:27843ms step_avg:125.99ms
step:232/1395 train_time:27972ms step_avg:126.00ms
step:233/1395 train_time:28101ms step_avg:126.02ms
step:234/1395 train_time:28230ms step_avg:126.03ms
step:235/1395 train_time:28359ms step_avg:126.04ms
step:236/1395 train_time:28489ms step_avg:126.06ms
step:237/1395 train_time:28619ms step_avg:126.08ms
step:238/1395 train_time:28748ms step_avg:126.09ms
step:239/1395 train_time:28878ms step_avg:126.10ms
step:240/1395 train_time:29007ms step_avg:126.12ms
step:241/1395 train_time:29136ms step_avg:126.13ms
step:242/1395 train_time:29264ms step_avg:126.14ms
step:243/1395 train_time:29394ms step_avg:126.15ms
step:244/1395 train_time:29523ms step_avg:126.17ms
step:245/1395 train_time:29652ms step_avg:126.18ms
step:246/1395 train_time:29781ms step_avg:126.19ms
step:247/1395 train_time:29911ms step_avg:126.21ms
step:248/1395 train_time:30039ms step_avg:126.21ms
step:249/1395 train_time:30168ms step_avg:126.23ms
step:250/1395 train_time:30299ms step_avg:126.25ms
step:250/1395 val_loss:3.9501 train_time:30403ms step_avg:126.68ms
step:251/1395 train_time:30437ms step_avg:126.29ms
step:252/1395 train_time:30573ms step_avg:126.33ms
step:253/1395 train_time:30701ms step_avg:126.34ms
step:254/1395 train_time:30830ms step_avg:126.35ms
step:255/1395 train_time:30958ms step_avg:126.36ms
step:256/1395 train_time:31086ms step_avg:126.37ms
step:257/1395 train_time:31215ms step_avg:126.38ms
step:258/1395 train_time:31343ms step_avg:126.38ms
step:259/1395 train_time:31473ms step_avg:126.40ms
step:260/1395 train_time:31603ms step_avg:126.41ms
step:261/1395 train_time:31733ms step_avg:126.43ms
step:262/1395 train_time:31863ms step_avg:126.44ms
step:263/1395 train_time:31991ms step_avg:126.45ms
step:264/1395 train_time:32120ms step_avg:126.46ms
step:265/1395 train_time:32249ms step_avg:126.47ms
step:266/1395 train_time:32378ms step_avg:126.47ms
step:267/1395 train_time:32507ms step_avg:126.49ms
step:268/1395 train_time:32637ms step_avg:126.50ms
step:269/1395 train_time:32766ms step_avg:126.51ms
step:270/1395 train_time:32895ms step_avg:126.52ms
step:271/1395 train_time:33024ms step_avg:126.53ms
step:272/1395 train_time:33154ms step_avg:126.54ms
step:273/1395 train_time:33283ms step_avg:126.55ms
step:274/1395 train_time:33412ms step_avg:126.56ms
step:275/1395 train_time:33541ms step_avg:126.57ms
step:276/1395 train_time:33671ms step_avg:126.58ms
step:277/1395 train_time:33799ms step_avg:126.59ms
step:278/1395 train_time:33929ms step_avg:126.60ms
step:279/1395 train_time:34058ms step_avg:126.61ms
step:280/1395 train_time:34186ms step_avg:126.62ms
step:281/1395 train_time:34316ms step_avg:126.63ms
step:282/1395 train_time:34446ms step_avg:126.64ms
step:283/1395 train_time:34575ms step_avg:126.65ms
step:284/1395 train_time:34706ms step_avg:126.66ms
step:285/1395 train_time:34836ms step_avg:126.68ms
step:286/1395 train_time:34965ms step_avg:126.68ms
step:287/1395 train_time:35093ms step_avg:126.69ms
step:288/1395 train_time:35222ms step_avg:126.70ms
step:289/1395 train_time:35352ms step_avg:126.71ms
step:290/1395 train_time:35480ms step_avg:126.72ms
step:291/1395 train_time:35611ms step_avg:126.73ms
step:292/1395 train_time:35741ms step_avg:126.74ms
step:293/1395 train_time:35872ms step_avg:126.75ms
step:294/1395 train_time:36001ms step_avg:126.77ms
step:295/1395 train_time:36131ms step_avg:126.78ms
step:296/1395 train_time:36261ms step_avg:126.79ms
step:297/1395 train_time:36390ms step_avg:126.79ms
step:298/1395 train_time:36518ms step_avg:126.80ms
step:299/1395 train_time:36648ms step_avg:126.81ms
step:300/1395 train_time:36778ms step_avg:126.82ms
step:301/1395 train_time:36908ms step_avg:126.83ms
step:302/1395 train_time:37039ms step_avg:126.85ms
step:303/1395 train_time:37169ms step_avg:126.85ms
step:304/1395 train_time:37298ms step_avg:126.86ms
step:305/1395 train_time:37427ms step_avg:126.87ms
step:306/1395 train_time:37558ms step_avg:126.88ms
step:307/1395 train_time:37686ms step_avg:126.89ms
step:308/1395 train_time:37817ms step_avg:126.90ms
step:309/1395 train_time:37947ms step_avg:126.91ms
step:310/1395 train_time:38077ms step_avg:126.92ms
step:311/1395 train_time:38206ms step_avg:126.93ms
step:312/1395 train_time:38336ms step_avg:126.94ms
step:313/1395 train_time:38468ms step_avg:126.96ms
step:314/1395 train_time:38600ms step_avg:126.97ms
step:315/1395 train_time:38730ms step_avg:126.98ms
step:316/1395 train_time:38862ms step_avg:127.00ms
step:317/1395 train_time:38993ms step_avg:127.01ms
step:318/1395 train_time:39123ms step_avg:127.02ms
step:319/1395 train_time:39257ms step_avg:127.04ms
step:320/1395 train_time:39388ms step_avg:127.06ms
step:321/1395 train_time:39519ms step_avg:127.07ms
step:322/1395 train_time:39650ms step_avg:127.08ms
step:323/1395 train_time:39780ms step_avg:127.09ms
step:324/1395 train_time:39912ms step_avg:127.11ms
step:325/1395 train_time:40043ms step_avg:127.12ms
step:326/1395 train_time:40175ms step_avg:127.14ms
step:327/1395 train_time:40306ms step_avg:127.15ms
step:328/1395 train_time:40438ms step_avg:127.16ms
step:329/1395 train_time:40571ms step_avg:127.18ms
step:330/1395 train_time:40701ms step_avg:127.19ms
step:331/1395 train_time:40833ms step_avg:127.21ms
step:332/1395 train_time:40966ms step_avg:127.22ms
step:333/1395 train_time:41097ms step_avg:127.23ms
step:334/1395 train_time:41228ms step_avg:127.25ms
step:335/1395 train_time:41359ms step_avg:127.26ms
step:336/1395 train_time:41490ms step_avg:127.27ms
step:337/1395 train_time:41620ms step_avg:127.28ms
step:338/1395 train_time:41754ms step_avg:127.30ms
step:339/1395 train_time:41884ms step_avg:127.31ms
step:340/1395 train_time:42016ms step_avg:127.32ms
step:341/1395 train_time:42148ms step_avg:127.34ms
step:342/1395 train_time:42279ms step_avg:127.35ms
step:343/1395 train_time:42410ms step_avg:127.36ms
step:344/1395 train_time:42542ms step_avg:127.37ms
step:345/1395 train_time:42673ms step_avg:127.38ms
step:346/1395 train_time:42804ms step_avg:127.39ms
step:347/1395 train_time:42937ms step_avg:127.41ms
step:348/1395 train_time:43069ms step_avg:127.42ms
step:349/1395 train_time:43200ms step_avg:127.43ms
step:350/1395 train_time:43331ms step_avg:127.44ms
step:351/1395 train_time:43462ms step_avg:127.45ms
step:352/1395 train_time:43593ms step_avg:127.46ms
step:353/1395 train_time:43724ms step_avg:127.48ms
step:354/1395 train_time:43857ms step_avg:127.49ms
step:355/1395 train_time:43990ms step_avg:127.51ms
step:356/1395 train_time:44122ms step_avg:127.52ms
step:357/1395 train_time:44254ms step_avg:127.53ms
step:358/1395 train_time:44385ms step_avg:127.54ms
step:359/1395 train_time:44517ms step_avg:127.56ms
step:360/1395 train_time:44650ms step_avg:127.57ms
step:361/1395 train_time:44781ms step_avg:127.58ms
step:362/1395 train_time:44914ms step_avg:127.60ms
step:363/1395 train_time:45045ms step_avg:127.61ms
step:364/1395 train_time:45178ms step_avg:127.62ms
step:365/1395 train_time:45310ms step_avg:127.63ms
step:366/1395 train_time:45441ms step_avg:127.64ms
step:367/1395 train_time:45573ms step_avg:127.66ms
step:368/1395 train_time:45703ms step_avg:127.66ms
step:369/1395 train_time:45834ms step_avg:127.67ms
step:370/1395 train_time:45966ms step_avg:127.68ms
step:371/1395 train_time:46098ms step_avg:127.70ms
step:372/1395 train_time:46229ms step_avg:127.71ms
step:373/1395 train_time:46360ms step_avg:127.71ms
step:374/1395 train_time:46491ms step_avg:127.72ms
step:375/1395 train_time:46622ms step_avg:127.73ms
step:375/1395 val_loss:3.7704 train_time:46727ms step_avg:128.02ms
step:376/1395 train_time:46761ms step_avg:127.76ms
step:377/1395 train_time:46896ms step_avg:127.78ms
step:378/1395 train_time:47027ms step_avg:127.79ms
step:379/1395 train_time:47158ms step_avg:127.80ms
step:380/1395 train_time:47289ms step_avg:127.81ms
step:381/1395 train_time:47421ms step_avg:127.82ms
step:382/1395 train_time:47552ms step_avg:127.83ms
step:383/1395 train_time:47684ms step_avg:127.84ms
step:384/1395 train_time:47818ms step_avg:127.86ms
step:385/1395 train_time:47950ms step_avg:127.87ms
step:386/1395 train_time:48082ms step_avg:127.88ms
step:387/1395 train_time:48214ms step_avg:127.89ms
step:388/1395 train_time:48344ms step_avg:127.89ms
step:389/1395 train_time:48475ms step_avg:127.90ms
step:390/1395 train_time:48606ms step_avg:127.91ms
step:391/1395 train_time:48736ms step_avg:127.92ms
step:392/1395 train_time:48868ms step_avg:127.93ms
step:393/1395 train_time:49001ms step_avg:127.94ms
step:394/1395 train_time:49133ms step_avg:127.95ms
step:395/1395 train_time:49265ms step_avg:127.96ms
step:396/1395 train_time:49397ms step_avg:127.97ms
step:397/1395 train_time:49528ms step_avg:127.98ms
step:398/1395 train_time:49661ms step_avg:127.99ms
step:399/1395 train_time:49793ms step_avg:128.00ms
step:400/1395 train_time:49925ms step_avg:128.01ms
step:401/1395 train_time:50056ms step_avg:128.02ms
step:402/1395 train_time:50188ms step_avg:128.03ms
step:403/1395 train_time:50320ms step_avg:128.04ms
step:404/1395 train_time:50451ms step_avg:128.05ms
step:405/1395 train_time:50582ms step_avg:128.06ms
step:406/1395 train_time:50714ms step_avg:128.07ms
step:407/1395 train_time:50846ms step_avg:128.08ms
step:408/1395 train_time:50979ms step_avg:128.09ms
step:409/1395 train_time:51108ms step_avg:128.09ms
step:410/1395 train_time:51242ms step_avg:128.10ms
step:411/1395 train_time:51375ms step_avg:128.12ms
step:412/1395 train_time:51506ms step_avg:128.13ms
step:413/1395 train_time:51638ms step_avg:128.13ms
step:414/1395 train_time:51769ms step_avg:128.14ms
step:415/1395 train_time:51903ms step_avg:128.16ms
step:416/1395 train_time:52036ms step_avg:128.17ms
step:417/1395 train_time:52168ms step_avg:128.18ms
step:418/1395 train_time:52302ms step_avg:128.19ms
step:419/1395 train_time:52435ms step_avg:128.20ms
step:420/1395 train_time:52567ms step_avg:128.21ms
step:421/1395 train_time:52702ms step_avg:128.23ms
step:422/1395 train_time:52834ms step_avg:128.24ms
step:423/1395 train_time:52967ms step_avg:128.25ms
step:424/1395 train_time:53101ms step_avg:128.26ms
step:425/1395 train_time:53233ms step_avg:128.27ms
step:426/1395 train_time:53367ms step_avg:128.29ms
step:427/1395 train_time:53500ms step_avg:128.30ms
step:428/1395 train_time:53632ms step_avg:128.31ms
step:429/1395 train_time:53766ms step_avg:128.32ms
step:430/1395 train_time:53900ms step_avg:128.33ms
step:431/1395 train_time:54033ms step_avg:128.34ms
step:432/1395 train_time:54167ms step_avg:128.36ms
step:433/1395 train_time:54301ms step_avg:128.37ms
step:434/1395 train_time:54433ms step_avg:128.38ms
step:435/1395 train_time:54567ms step_avg:128.39ms
step:436/1395 train_time:54702ms step_avg:128.41ms
step:437/1395 train_time:54835ms step_avg:128.42ms
step:438/1395 train_time:54967ms step_avg:128.43ms
step:439/1395 train_time:55101ms step_avg:128.44ms
step:440/1395 train_time:55233ms step_avg:128.45ms
step:441/1395 train_time:55366ms step_avg:128.46ms
step:442/1395 train_time:55499ms step_avg:128.47ms
step:443/1395 train_time:55631ms step_avg:128.48ms
step:444/1395 train_time:55765ms step_avg:128.49ms
step:445/1395 train_time:55898ms step_avg:128.50ms
step:446/1395 train_time:56031ms step_avg:128.51ms
step:447/1395 train_time:56166ms step_avg:128.53ms
step:448/1395 train_time:56300ms step_avg:128.54ms
step:449/1395 train_time:56433ms step_avg:128.55ms
step:450/1395 train_time:56567ms step_avg:128.56ms
step:451/1395 train_time:56701ms step_avg:128.57ms
step:452/1395 train_time:56833ms step_avg:128.58ms
step:453/1395 train_time:56967ms step_avg:128.59ms
step:454/1395 train_time:57100ms step_avg:128.60ms
step:455/1395 train_time:57233ms step_avg:128.61ms
step:456/1395 train_time:57367ms step_avg:128.63ms
step:457/1395 train_time:57500ms step_avg:128.64ms
step:458/1395 train_time:57633ms step_avg:128.65ms
step:459/1395 train_time:57767ms step_avg:128.66ms
step:460/1395 train_time:57901ms step_avg:128.67ms
step:461/1395 train_time:58034ms step_avg:128.68ms
step:462/1395 train_time:58167ms step_avg:128.69ms
step:463/1395 train_time:58301ms step_avg:128.70ms
step:464/1395 train_time:58434ms step_avg:128.71ms
step:465/1395 train_time:58567ms step_avg:128.72ms
step:466/1395 train_time:58701ms step_avg:128.73ms
step:467/1395 train_time:58834ms step_avg:128.74ms
step:468/1395 train_time:58967ms step_avg:128.75ms
step:469/1395 train_time:59102ms step_avg:128.76ms
step:470/1395 train_time:59236ms step_avg:128.77ms
step:471/1395 train_time:59368ms step_avg:128.78ms
step:472/1395 train_time:59503ms step_avg:128.79ms
step:473/1395 train_time:59636ms step_avg:128.80ms
step:474/1395 train_time:59770ms step_avg:128.81ms
step:475/1395 train_time:59905ms step_avg:128.83ms
step:476/1395 train_time:60039ms step_avg:128.84ms
step:477/1395 train_time:60172ms step_avg:128.85ms
step:478/1395 train_time:60306ms step_avg:128.86ms
step:479/1395 train_time:60437ms step_avg:128.86ms
step:480/1395 train_time:60569ms step_avg:128.87ms
step:481/1395 train_time:60703ms step_avg:128.88ms
step:482/1395 train_time:60836ms step_avg:128.89ms
step:483/1395 train_time:60970ms step_avg:128.90ms
step:484/1395 train_time:61105ms step_avg:128.91ms
step:485/1395 train_time:61240ms step_avg:128.93ms
step:486/1395 train_time:61374ms step_avg:128.94ms
step:487/1395 train_time:61508ms step_avg:128.95ms
step:488/1395 train_time:61640ms step_avg:128.95ms
step:489/1395 train_time:61772ms step_avg:128.96ms
step:490/1395 train_time:61906ms step_avg:128.97ms
step:491/1395 train_time:62040ms step_avg:128.98ms
step:492/1395 train_time:62172ms step_avg:128.99ms
step:493/1395 train_time:62306ms step_avg:129.00ms
step:494/1395 train_time:62439ms step_avg:129.01ms
step:495/1395 train_time:62571ms step_avg:129.01ms
step:496/1395 train_time:62706ms step_avg:129.02ms
step:497/1395 train_time:62837ms step_avg:129.03ms
step:498/1395 train_time:62970ms step_avg:129.04ms
step:499/1395 train_time:63103ms step_avg:129.05ms
step:500/1395 train_time:63235ms step_avg:129.05ms
step:500/1395 val_loss:3.6542 train_time:63342ms step_avg:129.27ms
step:501/1395 train_time:63376ms step_avg:129.07ms
step:502/1395 train_time:63513ms step_avg:129.09ms
step:503/1395 train_time:63647ms step_avg:129.10ms
step:504/1395 train_time:63779ms step_avg:129.11ms
step:505/1395 train_time:63912ms step_avg:129.11ms
step:506/1395 train_time:64044ms step_avg:129.12ms
step:507/1395 train_time:64176ms step_avg:129.13ms
step:508/1395 train_time:64310ms step_avg:129.14ms
step:509/1395 train_time:64444ms step_avg:129.15ms
step:510/1395 train_time:64579ms step_avg:129.16ms
step:511/1395 train_time:64711ms step_avg:129.16ms
step:512/1395 train_time:64845ms step_avg:129.17ms
| step:513/1395 train_time:64978ms step_avg:129.18ms | |
| step:514/1395 train_time:65110ms step_avg:129.19ms | |
| step:515/1395 train_time:65242ms step_avg:129.19ms | |
| step:516/1395 train_time:65377ms step_avg:129.20ms | |
| step:517/1395 train_time:65512ms step_avg:129.22ms | |
| step:518/1395 train_time:65646ms step_avg:129.22ms | |
| step:519/1395 train_time:65780ms step_avg:129.23ms | |
| step:520/1395 train_time:65916ms step_avg:129.25ms | |
| step:521/1395 train_time:66051ms step_avg:129.26ms | |
| step:522/1395 train_time:66185ms step_avg:129.27ms | |
| step:523/1395 train_time:66320ms step_avg:129.28ms | |
| step:524/1395 train_time:66455ms step_avg:129.29ms | |
| step:525/1395 train_time:66590ms step_avg:129.30ms | |
| step:526/1395 train_time:66724ms step_avg:129.31ms | |
| step:527/1395 train_time:66859ms step_avg:129.32ms | |
| step:528/1395 train_time:66993ms step_avg:129.33ms | |
| step:529/1395 train_time:67128ms step_avg:129.34ms | |
| step:530/1395 train_time:67262ms step_avg:129.35ms | |
| step:531/1395 train_time:67398ms step_avg:129.36ms | |
| step:532/1395 train_time:67535ms step_avg:129.38ms | |
| step:533/1395 train_time:67670ms step_avg:129.39ms | |
| step:534/1395 train_time:67806ms step_avg:129.40ms | |
| step:535/1395 train_time:67941ms step_avg:129.41ms | |
| step:536/1395 train_time:68078ms step_avg:129.43ms | |
| step:537/1395 train_time:68214ms step_avg:129.44ms | |
| step:538/1395 train_time:68348ms step_avg:129.45ms | |
| step:539/1395 train_time:68484ms step_avg:129.46ms | |
| step:540/1395 train_time:68619ms step_avg:129.47ms | |
| step:541/1395 train_time:68755ms step_avg:129.48ms | |
| step:542/1395 train_time:68892ms step_avg:129.50ms | |
| step:543/1395 train_time:69026ms step_avg:129.50ms | |
| step:544/1395 train_time:69160ms step_avg:129.51ms | |
| step:545/1395 train_time:69294ms step_avg:129.52ms | |
| step:546/1395 train_time:69428ms step_avg:129.53ms | |
| step:547/1395 train_time:69562ms step_avg:129.54ms | |
| step:548/1395 train_time:69698ms step_avg:129.55ms | |
| step:549/1395 train_time:69834ms step_avg:129.56ms | |
| step:550/1395 train_time:69967ms step_avg:129.57ms | |
| step:551/1395 train_time:70102ms step_avg:129.58ms | |
| step:552/1395 train_time:70238ms step_avg:129.59ms | |
| step:553/1395 train_time:70372ms step_avg:129.60ms | |
| step:554/1395 train_time:70507ms step_avg:129.61ms | |
| step:555/1395 train_time:70642ms step_avg:129.62ms | |
| step:556/1395 train_time:70778ms step_avg:129.63ms | |
| step:557/1395 train_time:70914ms step_avg:129.64ms | |
| step:558/1395 train_time:71048ms step_avg:129.65ms | |
| step:559/1395 train_time:71182ms step_avg:129.66ms | |
| step:560/1395 train_time:71318ms step_avg:129.67ms | |
| step:561/1395 train_time:71454ms step_avg:129.68ms | |
| step:562/1395 train_time:71589ms step_avg:129.69ms | |
| step:563/1395 train_time:71722ms step_avg:129.70ms | |
| step:564/1395 train_time:71858ms step_avg:129.71ms | |
| step:565/1395 train_time:71992ms step_avg:129.72ms | |
| step:566/1395 train_time:72128ms step_avg:129.73ms | |
| step:567/1395 train_time:72262ms step_avg:129.73ms | |
| step:568/1395 train_time:72398ms step_avg:129.75ms | |
| step:569/1395 train_time:72536ms step_avg:129.76ms | |
| step:570/1395 train_time:72670ms step_avg:129.77ms | |
| step:571/1395 train_time:72804ms step_avg:129.78ms | |
| step:572/1395 train_time:72941ms step_avg:129.79ms | |
| step:573/1395 train_time:73076ms step_avg:129.80ms | |
| step:574/1395 train_time:73212ms step_avg:129.81ms | |
| step:575/1395 train_time:73346ms step_avg:129.82ms | |
| step:576/1395 train_time:73481ms step_avg:129.83ms | |
| step:577/1395 train_time:73617ms step_avg:129.84ms | |
| step:578/1395 train_time:73751ms step_avg:129.84ms | |
| step:579/1395 train_time:73886ms step_avg:129.85ms | |
| step:580/1395 train_time:74020ms step_avg:129.86ms | |
| step:581/1395 train_time:74155ms step_avg:129.87ms | |
| step:582/1395 train_time:74289ms step_avg:129.88ms | |
| step:583/1395 train_time:74424ms step_avg:129.88ms | |
| step:584/1395 train_time:74560ms step_avg:129.90ms | |
| step:585/1395 train_time:74694ms step_avg:129.90ms | |
| step:586/1395 train_time:74832ms step_avg:129.92ms | |
| step:587/1395 train_time:74964ms step_avg:129.92ms | |
| step:588/1395 train_time:75100ms step_avg:129.93ms | |
| step:589/1395 train_time:75235ms step_avg:129.94ms | |
| step:590/1395 train_time:75372ms step_avg:129.95ms | |
| step:591/1395 train_time:75506ms step_avg:129.96ms | |
| step:592/1395 train_time:75642ms step_avg:129.97ms | |
| step:593/1395 train_time:75777ms step_avg:129.98ms | |
| step:594/1395 train_time:75913ms step_avg:129.99ms | |
| step:595/1395 train_time:76051ms step_avg:130.00ms | |
| step:596/1395 train_time:76184ms step_avg:130.01ms | |
| step:597/1395 train_time:76319ms step_avg:130.01ms | |
| step:598/1395 train_time:76455ms step_avg:130.03ms | |
| step:599/1395 train_time:76589ms step_avg:130.03ms | |
| step:600/1395 train_time:76724ms step_avg:130.04ms | |
| step:601/1395 train_time:76860ms step_avg:130.05ms | |
| step:602/1395 train_time:76997ms step_avg:130.06ms | |
| step:603/1395 train_time:77135ms step_avg:130.08ms | |
| step:604/1395 train_time:77270ms step_avg:130.08ms | |
| step:605/1395 train_time:77404ms step_avg:130.09ms | |
| step:606/1395 train_time:77540ms step_avg:130.10ms | |
| step:607/1395 train_time:77675ms step_avg:130.11ms | |
| step:608/1395 train_time:77809ms step_avg:130.12ms | |
| step:609/1395 train_time:77943ms step_avg:130.12ms | |
| step:610/1395 train_time:78079ms step_avg:130.13ms | |
| step:611/1395 train_time:78214ms step_avg:130.14ms | |
| step:612/1395 train_time:78351ms step_avg:130.15ms | |
| step:613/1395 train_time:78485ms step_avg:130.16ms | |
| step:614/1395 train_time:78620ms step_avg:130.17ms | |
| step:615/1395 train_time:78754ms step_avg:130.17ms | |
| step:616/1395 train_time:78888ms step_avg:130.18ms | |
| step:617/1395 train_time:79023ms step_avg:130.19ms | |
| step:618/1395 train_time:79159ms step_avg:130.20ms | |
| step:619/1395 train_time:79295ms step_avg:130.20ms | |
| step:620/1395 train_time:79430ms step_avg:130.21ms | |
| step:621/1395 train_time:79565ms step_avg:130.22ms | |
| step:622/1395 train_time:79702ms step_avg:130.23ms | |
| step:623/1395 train_time:79840ms step_avg:130.25ms | |
| step:624/1395 train_time:79977ms step_avg:130.26ms | |
| step:625/1395 train_time:80113ms step_avg:130.27ms | |
| step:625/1395 val_loss:3.5718 train_time:80223ms step_avg:130.44ms | |
| step:626/1395 train_time:80257ms step_avg:130.29ms | |
| step:627/1395 train_time:80395ms step_avg:130.30ms | |
| step:628/1395 train_time:80532ms step_avg:130.31ms | |
| step:629/1395 train_time:80670ms step_avg:130.32ms | |
| step:630/1395 train_time:80805ms step_avg:130.33ms | |
| step:631/1395 train_time:80940ms step_avg:130.34ms | |
| step:632/1395 train_time:81076ms step_avg:130.35ms | |
| step:633/1395 train_time:81213ms step_avg:130.36ms | |
| step:634/1395 train_time:81354ms step_avg:130.38ms | |
| step:635/1395 train_time:81491ms step_avg:130.39ms | |
| step:636/1395 train_time:81628ms step_avg:130.40ms | |
| step:637/1395 train_time:81762ms step_avg:130.40ms | |
| step:638/1395 train_time:81897ms step_avg:130.41ms | |
| step:639/1395 train_time:82034ms step_avg:130.42ms | |
| step:640/1395 train_time:82171ms step_avg:130.43ms | |
| step:641/1395 train_time:82308ms step_avg:130.44ms | |
| step:642/1395 train_time:82443ms step_avg:130.45ms | |
| step:643/1395 train_time:82579ms step_avg:130.46ms | |
| step:644/1395 train_time:82714ms step_avg:130.46ms | |
| step:645/1395 train_time:82849ms step_avg:130.47ms | |
| step:646/1395 train_time:82984ms step_avg:130.48ms | |
| step:647/1395 train_time:83119ms step_avg:130.48ms | |
| step:648/1395 train_time:83257ms step_avg:130.50ms | |
| step:649/1395 train_time:83392ms step_avg:130.50ms | |
| step:650/1395 train_time:83530ms step_avg:130.52ms | |
| step:651/1395 train_time:83668ms step_avg:130.53ms | |
| step:652/1395 train_time:83804ms step_avg:130.54ms | |
| step:653/1395 train_time:83939ms step_avg:130.54ms | |
| step:654/1395 train_time:84077ms step_avg:130.55ms | |
| step:655/1395 train_time:84213ms step_avg:130.56ms | |
| step:656/1395 train_time:84350ms step_avg:130.57ms | |
| step:657/1395 train_time:84487ms step_avg:130.58ms | |
| step:658/1395 train_time:84623ms step_avg:130.59ms | |
| step:659/1395 train_time:84760ms step_avg:130.60ms | |
| step:660/1395 train_time:84895ms step_avg:130.61ms | |
| step:661/1395 train_time:85034ms step_avg:130.62ms | |
| step:662/1395 train_time:85171ms step_avg:130.63ms | |
| step:663/1395 train_time:85308ms step_avg:130.64ms | |
| step:664/1395 train_time:85444ms step_avg:130.65ms | |
| step:665/1395 train_time:85579ms step_avg:130.65ms | |
| step:666/1395 train_time:85715ms step_avg:130.66ms | |
| step:667/1395 train_time:85853ms step_avg:130.67ms | |
| step:668/1395 train_time:85989ms step_avg:130.68ms | |
| step:669/1395 train_time:86127ms step_avg:130.69ms | |
| step:670/1395 train_time:86261ms step_avg:130.70ms | |
| step:671/1395 train_time:86397ms step_avg:130.71ms | |
| step:672/1395 train_time:86535ms step_avg:130.72ms | |
| step:673/1395 train_time:86671ms step_avg:130.73ms | |
| step:674/1395 train_time:86806ms step_avg:130.73ms | |
| step:675/1395 train_time:86942ms step_avg:130.74ms | |
| step:676/1395 train_time:87078ms step_avg:130.75ms | |
| step:677/1395 train_time:87216ms step_avg:130.76ms | |
| step:678/1395 train_time:87353ms step_avg:130.77ms | |
| step:679/1395 train_time:87490ms step_avg:130.78ms | |
| step:680/1395 train_time:87627ms step_avg:130.79ms | |
| step:681/1395 train_time:87762ms step_avg:130.79ms | |
| step:682/1395 train_time:87898ms step_avg:130.80ms | |
| step:683/1395 train_time:88037ms step_avg:130.81ms | |
| step:684/1395 train_time:88175ms step_avg:130.82ms | |
| step:685/1395 train_time:88313ms step_avg:130.83ms | |
| step:686/1395 train_time:88450ms step_avg:130.84ms | |
| step:687/1395 train_time:88587ms step_avg:130.85ms | |
| step:688/1395 train_time:88722ms step_avg:130.86ms | |
| step:689/1395 train_time:88860ms step_avg:130.87ms | |
| step:690/1395 train_time:88996ms step_avg:130.88ms | |
| step:691/1395 train_time:89132ms step_avg:130.88ms | |
| step:692/1395 train_time:89269ms step_avg:130.89ms | |
| step:693/1395 train_time:89405ms step_avg:130.90ms | |
| step:694/1395 train_time:89542ms step_avg:130.91ms | |
| step:695/1395 train_time:89677ms step_avg:130.91ms | |
| step:696/1395 train_time:89813ms step_avg:130.92ms | |
| step:697/1395 train_time:89949ms step_avg:130.93ms | |
| step:698/1395 train_time:90084ms step_avg:130.94ms | |
| step:699/1395 train_time:90223ms step_avg:130.95ms | |
| step:700/1395 train_time:90358ms step_avg:130.95ms | |
| step:701/1395 train_time:90497ms step_avg:130.96ms | |
| step:702/1395 train_time:90635ms step_avg:130.98ms | |
| step:703/1395 train_time:90771ms step_avg:130.98ms | |
| step:704/1395 train_time:90907ms step_avg:130.99ms | |
| step:705/1395 train_time:91042ms step_avg:131.00ms | |
| step:706/1395 train_time:91177ms step_avg:131.00ms | |
| step:707/1395 train_time:91313ms step_avg:131.01ms | |
| step:708/1395 train_time:91452ms step_avg:131.02ms | |
| step:709/1395 train_time:91591ms step_avg:131.03ms | |
| step:710/1395 train_time:91727ms step_avg:131.04ms | |
| step:711/1395 train_time:91862ms step_avg:131.04ms | |
| step:712/1395 train_time:92000ms step_avg:131.05ms | |
| step:713/1395 train_time:92137ms step_avg:131.06ms | |
| step:714/1395 train_time:92273ms step_avg:131.07ms | |
| step:715/1395 train_time:92408ms step_avg:131.08ms | |
| step:716/1395 train_time:92545ms step_avg:131.08ms | |
| step:717/1395 train_time:92681ms step_avg:131.09ms | |
| step:718/1395 train_time:92817ms step_avg:131.10ms | |
| step:719/1395 train_time:92955ms step_avg:131.11ms | |
| step:720/1395 train_time:93092ms step_avg:131.12ms | |
| step:721/1395 train_time:93229ms step_avg:131.12ms | |
| step:722/1395 train_time:93364ms step_avg:131.13ms | |
| step:723/1395 train_time:93499ms step_avg:131.13ms | |
| step:724/1395 train_time:93637ms step_avg:131.14ms | |
| step:725/1395 train_time:93774ms step_avg:131.15ms | |
| step:726/1395 train_time:93912ms step_avg:131.16ms | |
| step:727/1395 train_time:94053ms step_avg:131.18ms | |
| step:728/1395 train_time:94189ms step_avg:131.18ms | |
| step:729/1395 train_time:94326ms step_avg:131.19ms | |
| step:730/1395 train_time:94464ms step_avg:131.20ms | |
| step:731/1395 train_time:94601ms step_avg:131.21ms | |
| step:732/1395 train_time:94737ms step_avg:131.22ms | |
| step:733/1395 train_time:94877ms step_avg:131.23ms | |
| step:734/1395 train_time:95014ms step_avg:131.24ms | |
| step:735/1395 train_time:95151ms step_avg:131.24ms | |
| step:736/1395 train_time:95288ms step_avg:131.25ms | |
| step:737/1395 train_time:95429ms step_avg:131.26ms | |
| step:738/1395 train_time:95569ms step_avg:131.28ms | |
| step:739/1395 train_time:95706ms step_avg:131.28ms | |
| step:740/1395 train_time:95846ms step_avg:131.30ms | |
| step:741/1395 train_time:95985ms step_avg:131.31ms | |
| step:742/1395 train_time:96125ms step_avg:131.32ms | |
| step:743/1395 train_time:96263ms step_avg:131.33ms | |
| step:744/1395 train_time:96399ms step_avg:131.33ms | |
| step:745/1395 train_time:96538ms step_avg:131.34ms | |
| step:746/1395 train_time:96674ms step_avg:131.35ms | |
| step:747/1395 train_time:96813ms step_avg:131.36ms | |
| step:748/1395 train_time:96952ms step_avg:131.37ms | |
| step:749/1395 train_time:97092ms step_avg:131.38ms | |
| step:750/1395 train_time:97229ms step_avg:131.39ms | |
| step:750/1395 val_loss:3.5197 train_time:97342ms step_avg:131.54ms | |
| step:751/1395 train_time:97376ms step_avg:131.41ms | |
| step:752/1395 train_time:97515ms step_avg:131.42ms | |
| step:753/1395 train_time:97652ms step_avg:131.43ms | |
| step:754/1395 train_time:97789ms step_avg:131.44ms | |
| step:755/1395 train_time:97928ms step_avg:131.45ms | |
| step:756/1395 train_time:98066ms step_avg:131.46ms | |
| step:757/1395 train_time:98205ms step_avg:131.47ms | |
| step:758/1395 train_time:98341ms step_avg:131.47ms | |
| step:759/1395 train_time:98479ms step_avg:131.48ms | |
| step:760/1395 train_time:98617ms step_avg:131.49ms | |
| step:761/1395 train_time:98756ms step_avg:131.50ms | |
| step:762/1395 train_time:98894ms step_avg:131.51ms | |
| step:763/1395 train_time:99031ms step_avg:131.51ms | |
| step:764/1395 train_time:99169ms step_avg:131.52ms | |
| step:765/1395 train_time:99307ms step_avg:131.53ms | |
| step:766/1395 train_time:99445ms step_avg:131.54ms | |
| step:767/1395 train_time:99582ms step_avg:131.55ms | |
| step:768/1395 train_time:99722ms step_avg:131.56ms | |
| step:769/1395 train_time:99862ms step_avg:131.57ms | |
| step:770/1395 train_time:99999ms step_avg:131.58ms | |
| step:771/1395 train_time:100139ms step_avg:131.59ms | |
| step:772/1395 train_time:100278ms step_avg:131.60ms | |
| step:773/1395 train_time:100417ms step_avg:131.61ms | |
| step:774/1395 train_time:100557ms step_avg:131.62ms | |
| step:775/1395 train_time:100695ms step_avg:131.63ms | |
| step:776/1395 train_time:100834ms step_avg:131.64ms | |
| step:777/1395 train_time:100970ms step_avg:131.64ms | |
| step:778/1395 train_time:101109ms step_avg:131.65ms | |
| step:779/1395 train_time:101246ms step_avg:131.66ms | |
| step:780/1395 train_time:101383ms step_avg:131.67ms | |
| step:781/1395 train_time:101521ms step_avg:131.67ms | |
| step:782/1395 train_time:101660ms step_avg:131.68ms | |
| step:783/1395 train_time:101796ms step_avg:131.69ms | |
| step:784/1395 train_time:101933ms step_avg:131.70ms | |
| step:785/1395 train_time:102071ms step_avg:131.70ms | |
| step:786/1395 train_time:102210ms step_avg:131.71ms | |
| step:787/1395 train_time:102348ms step_avg:131.72ms | |
| step:788/1395 train_time:102484ms step_avg:131.73ms | |
| step:789/1395 train_time:102621ms step_avg:131.73ms | |
| step:790/1395 train_time:102760ms step_avg:131.74ms | |
| step:791/1395 train_time:102897ms step_avg:131.75ms | |
| step:792/1395 train_time:103037ms step_avg:131.76ms | |
| step:793/1395 train_time:103175ms step_avg:131.77ms | |
| step:794/1395 train_time:103315ms step_avg:131.78ms | |
| step:795/1395 train_time:103454ms step_avg:131.79ms | |
| step:796/1395 train_time:103594ms step_avg:131.80ms | |
| step:797/1395 train_time:103732ms step_avg:131.81ms | |
| step:798/1395 train_time:103869ms step_avg:131.81ms | |
| step:799/1395 train_time:104007ms step_avg:131.82ms | |
| step:800/1395 train_time:104143ms step_avg:131.83ms | |
| step:801/1395 train_time:104281ms step_avg:131.83ms | |
| step:802/1395 train_time:104420ms step_avg:131.84ms | |
| step:803/1395 train_time:104556ms step_avg:131.85ms | |
| step:804/1395 train_time:104695ms step_avg:131.86ms | |
| step:805/1395 train_time:104833ms step_avg:131.87ms | |
| step:806/1395 train_time:104973ms step_avg:131.88ms | |
| step:807/1395 train_time:105111ms step_avg:131.88ms | |
| step:808/1395 train_time:105250ms step_avg:131.89ms | |
| step:809/1395 train_time:105386ms step_avg:131.90ms | |
| step:810/1395 train_time:105523ms step_avg:131.90ms | |
| step:811/1395 train_time:105660ms step_avg:131.91ms | |
| step:812/1395 train_time:105800ms step_avg:131.92ms | |
| step:813/1395 train_time:105938ms step_avg:131.93ms | |
| step:814/1395 train_time:106074ms step_avg:131.93ms | |
| step:815/1395 train_time:106211ms step_avg:131.94ms | |
| step:816/1395 train_time:106348ms step_avg:131.95ms | |
| step:817/1395 train_time:106485ms step_avg:131.95ms | |
| step:818/1395 train_time:106621ms step_avg:131.96ms | |
| step:819/1395 train_time:106761ms step_avg:131.97ms | |
| step:820/1395 train_time:106901ms step_avg:131.98ms | |
| step:821/1395 train_time:107038ms step_avg:131.98ms | |
| step:822/1395 train_time:107176ms step_avg:131.99ms | |
| step:823/1395 train_time:107312ms step_avg:132.00ms | |
| step:824/1395 train_time:107448ms step_avg:132.00ms | |
| step:825/1395 train_time:107584ms step_avg:132.00ms | |
| step:826/1395 train_time:107723ms step_avg:132.01ms | |
| step:827/1395 train_time:107861ms step_avg:132.02ms | |
| step:828/1395 train_time:108000ms step_avg:132.03ms | |
| step:829/1395 train_time:108139ms step_avg:132.04ms | |
| step:830/1395 train_time:108278ms step_avg:132.05ms | |
| step:831/1395 train_time:108416ms step_avg:132.05ms | |
| step:832/1395 train_time:108558ms step_avg:132.07ms | |
| step:833/1395 train_time:108697ms step_avg:132.07ms | |
| step:834/1395 train_time:108837ms step_avg:132.08ms | |
| step:835/1395 train_time:108978ms step_avg:132.09ms | |
| step:836/1395 train_time:109118ms step_avg:132.10ms | |
| step:837/1395 train_time:109257ms step_avg:132.11ms | |
| step:838/1395 train_time:109394ms step_avg:132.12ms | |
| step:839/1395 train_time:109531ms step_avg:132.12ms | |
| step:840/1395 train_time:109667ms step_avg:132.13ms | |
| step:841/1395 train_time:109805ms step_avg:132.14ms | |
| step:842/1395 train_time:109945ms step_avg:132.15ms | |
| step:843/1395 train_time:110082ms step_avg:132.15ms | |
| step:844/1395 train_time:110221ms step_avg:132.16ms | |
| step:845/1395 train_time:110358ms step_avg:132.17ms | |
| step:846/1395 train_time:110499ms step_avg:132.18ms | |
| step:847/1395 train_time:110637ms step_avg:132.18ms | |
| step:848/1395 train_time:110773ms step_avg:132.19ms | |
| step:849/1395 train_time:110914ms step_avg:132.20ms | |
| step:850/1395 train_time:111054ms step_avg:132.21ms | |
| step:851/1395 train_time:111194ms step_avg:132.22ms | |
| step:852/1395 train_time:111333ms step_avg:132.22ms | |
| step:853/1395 train_time:111469ms step_avg:132.23ms | |
| step:854/1395 train_time:111607ms step_avg:132.24ms | |
| step:855/1395 train_time:111744ms step_avg:132.24ms | |
| step:856/1395 train_time:111882ms step_avg:132.25ms | |
| step:857/1395 train_time:112020ms step_avg:132.26ms | |
| step:858/1395 train_time:112162ms step_avg:132.27ms | |
| step:859/1395 train_time:112301ms step_avg:132.27ms | |
| step:860/1395 train_time:112437ms step_avg:132.28ms | |
| step:861/1395 train_time:112578ms step_avg:132.29ms | |
| step:862/1395 train_time:112717ms step_avg:132.30ms | |
| step:863/1395 train_time:112859ms step_avg:132.31ms | |
| step:864/1395 train_time:112998ms step_avg:132.32ms | |
| step:865/1395 train_time:113135ms step_avg:132.32ms | |
| step:866/1395 train_time:113281ms step_avg:132.34ms | |
| step:867/1395 train_time:113421ms step_avg:132.35ms | |
| step:868/1395 train_time:113559ms step_avg:132.35ms | |
| step:869/1395 train_time:113696ms step_avg:132.36ms | |
| step:870/1395 train_time:113834ms step_avg:132.37ms | |
| step:871/1395 train_time:113975ms step_avg:132.38ms | |
| step:872/1395 train_time:114113ms step_avg:132.38ms | |
| step:873/1395 train_time:114256ms step_avg:132.39ms | |
| step:874/1395 train_time:114396ms step_avg:132.40ms | |
| step:875/1395 train_time:114537ms step_avg:132.41ms | |
| step:875/1395 val_loss:3.4735 train_time:114648ms step_avg:132.54ms | |
| step:876/1395 train_time:114682ms step_avg:132.43ms | |
| step:877/1395 train_time:114827ms step_avg:132.44ms | |
| step:878/1395 train_time:114966ms step_avg:132.45ms | |
| step:879/1395 train_time:115103ms step_avg:132.45ms | |
| step:880/1395 train_time:115240ms step_avg:132.46ms | |
| step:881/1395 train_time:115379ms step_avg:132.47ms | |
| step:882/1395 train_time:115516ms step_avg:132.47ms | |
| step:883/1395 train_time:115654ms step_avg:132.48ms | |
| step:884/1395 train_time:115794ms step_avg:132.49ms | |
| step:885/1395 train_time:115934ms step_avg:132.50ms | |
| step:886/1395 train_time:116073ms step_avg:132.50ms | |
| step:887/1395 train_time:116210ms step_avg:132.51ms | |
| step:888/1395 train_time:116351ms step_avg:132.52ms | |
| step:889/1395 train_time:116492ms step_avg:132.53ms | |
| step:890/1395 train_time:116628ms step_avg:132.53ms | |
| step:891/1395 train_time:116767ms step_avg:132.54ms | |
| step:892/1395 train_time:116905ms step_avg:132.55ms | |
| step:893/1395 train_time:117043ms step_avg:132.55ms | |
| step:894/1395 train_time:117182ms step_avg:132.56ms | |
| step:895/1395 train_time:117320ms step_avg:132.56ms | |
| step:896/1395 train_time:117459ms step_avg:132.57ms | |
| step:897/1395 train_time:117599ms step_avg:132.58ms | |
| step:898/1395 train_time:117740ms step_avg:132.59ms | |
| step:899/1395 train_time:117880ms step_avg:132.60ms | |
| step:900/1395 train_time:118018ms step_avg:132.60ms | |
| step:901/1395 train_time:118157ms step_avg:132.61ms | |
| step:902/1395 train_time:118294ms step_avg:132.62ms | |
| step:903/1395 train_time:118437ms step_avg:132.63ms | |
| step:904/1395 train_time:118577ms step_avg:132.64ms | |
| step:905/1395 train_time:118715ms step_avg:132.64ms | |
| step:906/1395 train_time:118854ms step_avg:132.65ms | |
| step:907/1395 train_time:118994ms step_avg:132.66ms | |
| step:908/1395 train_time:119133ms step_avg:132.66ms | |
| step:909/1395 train_time:119271ms step_avg:132.67ms | |
| step:910/1395 train_time:119412ms step_avg:132.68ms | |
| step:911/1395 train_time:119550ms step_avg:132.69ms | |
| step:912/1395 train_time:119687ms step_avg:132.69ms | |
| step:913/1395 train_time:119825ms step_avg:132.70ms | |
| step:914/1395 train_time:119964ms step_avg:132.70ms | |
| step:915/1395 train_time:120106ms step_avg:132.71ms | |
| step:916/1395 train_time:120244ms step_avg:132.72ms | |
| step:917/1395 train_time:120384ms step_avg:132.73ms | |
| step:918/1395 train_time:120525ms step_avg:132.74ms | |
| step:919/1395 train_time:120666ms step_avg:132.75ms | |
| step:920/1395 train_time:120804ms step_avg:132.75ms | |
| step:921/1395 train_time:120945ms step_avg:132.76ms | |
| step:922/1395 train_time:121084ms step_avg:132.77ms | |
| step:923/1395 train_time:121221ms step_avg:132.77ms | |
| step:924/1395 train_time:121362ms step_avg:132.78ms | |
| step:925/1395 train_time:121500ms step_avg:132.79ms | |
| step:926/1395 train_time:121639ms step_avg:132.79ms | |
| step:927/1395 train_time:121779ms step_avg:132.80ms | |
| step:928/1395 train_time:121919ms step_avg:132.81ms | |
| step:929/1395 train_time:122057ms step_avg:132.82ms | |
| step:930/1395 train_time:122197ms step_avg:132.82ms | |
| step:931/1395 train_time:122337ms step_avg:132.83ms | |
| step:932/1395 train_time:122477ms step_avg:132.84ms | |
| step:933/1395 train_time:122619ms step_avg:132.85ms | |
| step:934/1395 train_time:122760ms step_avg:132.86ms | |
| step:935/1395 train_time:122903ms step_avg:132.87ms | |
| step:936/1395 train_time:123041ms step_avg:132.87ms | |
| step:937/1395 train_time:123188ms step_avg:132.89ms | |
| step:938/1395 train_time:123328ms step_avg:132.90ms | |
| step:939/1395 train_time:123468ms step_avg:132.90ms | |
| step:940/1395 train_time:123610ms step_avg:132.91ms | |
| step:941/1395 train_time:123748ms step_avg:132.92ms | |
| step:942/1395 train_time:123888ms step_avg:132.93ms | |
| step:943/1395 train_time:124031ms step_avg:132.94ms | |
| step:944/1395 train_time:124180ms step_avg:132.95ms | |
| step:945/1395 train_time:124321ms step_avg:132.96ms | |
| step:946/1395 train_time:124465ms step_avg:132.98ms | |
| step:947/1395 train_time:124607ms step_avg:132.99ms | |
| step:948/1395 train_time:124748ms step_avg:132.99ms | |
| step:949/1395 train_time:124887ms step_avg:133.00ms | |
| step:950/1395 train_time:125025ms step_avg:133.01ms | |
| step:951/1395 train_time:125167ms step_avg:133.01ms | |
| step:952/1395 train_time:125306ms step_avg:133.02ms | |
| step:953/1395 train_time:125447ms step_avg:133.03ms | |
| step:954/1395 train_time:125584ms step_avg:133.03ms | |
| step:955/1395 train_time:125726ms step_avg:133.04ms | |
| step:956/1395 train_time:125867ms step_avg:133.05ms | |
| step:957/1395 train_time:126009ms step_avg:133.06ms | |
| step:958/1395 train_time:126149ms step_avg:133.07ms | |
| step:959/1395 train_time:126291ms step_avg:133.08ms | |
| step:960/1395 train_time:126433ms step_avg:133.09ms | |
| step:961/1395 train_time:126571ms step_avg:133.09ms | |
| step:962/1395 train_time:126710ms step_avg:133.10ms | |
| step:963/1395 train_time:126855ms step_avg:133.11ms | |
| step:964/1395 train_time:126994ms step_avg:133.12ms | |
| step:965/1395 train_time:127133ms step_avg:133.12ms | |
| step:966/1395 train_time:127271ms step_avg:133.13ms | |
| step:967/1395 train_time:127411ms step_avg:133.14ms | |
| step:968/1395 train_time:127550ms step_avg:133.14ms | |
| step:969/1395 train_time:127690ms step_avg:133.15ms | |
| step:970/1395 train_time:127830ms step_avg:133.16ms | |
| step:971/1395 train_time:127969ms step_avg:133.16ms | |
| step:972/1395 train_time:128107ms step_avg:133.17ms | |
| step:973/1395 train_time:128247ms step_avg:133.17ms | |
| step:974/1395 train_time:128388ms step_avg:133.18ms | |
| step:975/1395 train_time:128525ms step_avg:133.19ms | |
| step:976/1395 train_time:128665ms step_avg:133.19ms | |
| step:977/1395 train_time:128803ms step_avg:133.20ms | |
| step:978/1395 train_time:128943ms step_avg:133.21ms | |
| step:979/1395 train_time:129083ms step_avg:133.21ms | |
| step:980/1395 train_time:129223ms step_avg:133.22ms | |
| step:981/1395 train_time:129362ms step_avg:133.23ms | |
| step:982/1395 train_time:129501ms step_avg:133.23ms | |
| step:983/1395 train_time:129639ms step_avg:133.24ms | |
| step:984/1395 train_time:129780ms step_avg:133.24ms | |
| step:985/1395 train_time:129921ms step_avg:133.25ms | |
| step:986/1395 train_time:130067ms step_avg:133.27ms | |
| step:987/1395 train_time:130207ms step_avg:133.27ms | |
| step:988/1395 train_time:130347ms step_avg:133.28ms | |
| step:989/1395 train_time:130485ms step_avg:133.28ms | |
| step:990/1395 train_time:130626ms step_avg:133.29ms | |
| step:991/1395 train_time:130765ms step_avg:133.30ms | |
| step:992/1395 train_time:130906ms step_avg:133.31ms | |
| step:993/1395 train_time:131051ms step_avg:133.32ms | |
| step:994/1395 train_time:131189ms step_avg:133.32ms | |
| step:995/1395 train_time:131326ms step_avg:133.33ms | |
| step:996/1395 train_time:131464ms step_avg:133.33ms | |
| step:997/1395 train_time:131602ms step_avg:133.34ms | |
| step:998/1395 train_time:131740ms step_avg:133.34ms | |
| step:999/1395 train_time:131880ms step_avg:133.35ms | |
| step:1000/1395 train_time:132020ms step_avg:133.35ms | |
| step:1000/1395 val_loss:3.4101 train_time:132138ms step_avg:133.47ms | |
| step:1001/1395 train_time:132171ms step_avg:133.37ms | |
| step:1002/1395 train_time:132309ms step_avg:133.38ms | |
| step:1003/1395 train_time:132451ms step_avg:133.38ms | |
| step:1004/1395 train_time:132591ms step_avg:133.39ms | |
| step:1005/1395 train_time:132732ms step_avg:133.40ms | |
| step:1006/1395 train_time:132872ms step_avg:133.41ms | |
| step:1007/1395 train_time:133012ms step_avg:133.41ms | |
| step:1008/1395 train_time:133156ms step_avg:133.42ms | |
| step:1009/1395 train_time:133299ms step_avg:133.43ms | |
| step:1010/1395 train_time:133437ms step_avg:133.44ms | |
| step:1011/1395 train_time:133577ms step_avg:133.44ms | |
| step:1012/1395 train_time:133715ms step_avg:133.45ms | |
| step:1013/1395 train_time:133858ms step_avg:133.46ms | |
| step:1014/1395 train_time:133997ms step_avg:133.46ms | |
| step:1015/1395 train_time:134135ms step_avg:133.47ms | |
| step:1016/1395 train_time:134276ms step_avg:133.48ms | |
| step:1017/1395 train_time:134418ms step_avg:133.48ms | |
| step:1018/1395 train_time:134557ms step_avg:133.49ms | |
| step:1019/1395 train_time:134698ms step_avg:133.50ms | |
| step:1020/1395 train_time:134841ms step_avg:133.51ms | |
| step:1021/1395 train_time:134979ms step_avg:133.51ms | |
| step:1022/1395 train_time:135118ms step_avg:133.52ms | |
| step:1023/1395 train_time:135259ms step_avg:133.52ms | |
| step:1024/1395 train_time:135397ms step_avg:133.53ms | |
| step:1025/1395 train_time:135538ms step_avg:133.53ms | |
| step:1026/1395 train_time:135676ms step_avg:133.54ms | |
| step:1027/1395 train_time:135817ms step_avg:133.55ms | |
| step:1028/1395 train_time:135960ms step_avg:133.56ms | |
| step:1029/1395 train_time:136103ms step_avg:133.57ms | |
| step:1030/1395 train_time:136243ms step_avg:133.57ms | |
| step:1031/1395 train_time:136381ms step_avg:133.58ms | |
| step:1032/1395 train_time:136520ms step_avg:133.58ms | |
| step:1033/1395 train_time:136657ms step_avg:133.58ms | |
| step:1034/1395 train_time:136798ms step_avg:133.59ms | |
| step:1035/1395 train_time:136941ms step_avg:133.60ms | |
| step:1036/1395 train_time:137080ms step_avg:133.61ms | |
| step:1037/1395 train_time:137224ms step_avg:133.62ms | |
| step:1038/1395 train_time:137369ms step_avg:133.63ms | |
| step:1039/1395 train_time:137510ms step_avg:133.63ms | |
| step:1040/1395 train_time:137650ms step_avg:133.64ms | |
| step:1041/1395 train_time:137791ms step_avg:133.65ms | |
| step:1042/1395 train_time:137933ms step_avg:133.66ms | |
| step:1043/1395 train_time:138074ms step_avg:133.66ms | |
| step:1044/1395 train_time:138217ms step_avg:133.67ms | |
| step:1045/1395 train_time:138362ms step_avg:133.68ms | |
| step:1046/1395 train_time:138503ms step_avg:133.69ms | |
| step:1047/1395 train_time:138643ms step_avg:133.70ms | |
| step:1048/1395 train_time:138782ms step_avg:133.70ms | |
| step:1049/1395 train_time:138920ms step_avg:133.71ms | |
| step:1050/1395 train_time:139059ms step_avg:133.71ms | |
| step:1051/1395 train_time:139201ms step_avg:133.72ms | |
| step:1052/1395 train_time:139342ms step_avg:133.73ms | |
| step:1053/1395 train_time:139480ms step_avg:133.73ms | |
| step:1054/1395 train_time:139623ms step_avg:133.74ms | |
| step:1055/1395 train_time:139763ms step_avg:133.74ms | |
| step:1056/1395 train_time:139900ms step_avg:133.75ms | |
| step:1057/1395 train_time:140039ms step_avg:133.75ms | |
| step:1058/1395 train_time:140181ms step_avg:133.76ms | |
| step:1059/1395 train_time:140323ms step_avg:133.77ms | |
| step:1060/1395 train_time:140465ms step_avg:133.78ms | |
| step:1061/1395 train_time:140605ms step_avg:133.78ms | |
| step:1062/1395 train_time:140749ms step_avg:133.79ms | |
| step:1063/1395 train_time:140889ms step_avg:133.80ms | |
| step:1064/1395 train_time:141032ms step_avg:133.81ms | |
| step:1065/1395 train_time:141173ms step_avg:133.81ms | |
| step:1066/1395 train_time:141315ms step_avg:133.82ms | |
| step:1067/1395 train_time:141458ms step_avg:133.83ms | |
| step:1068/1395 train_time:141598ms step_avg:133.84ms | |
| step:1069/1395 train_time:141740ms step_avg:133.84ms | |
| step:1070/1395 train_time:141879ms step_avg:133.85ms | |
| step:1071/1395 train_time:142024ms step_avg:133.86ms | |
| step:1072/1395 train_time:142164ms step_avg:133.86ms | |
| step:1073/1395 train_time:142302ms step_avg:133.87ms | |
| step:1074/1395 train_time:142442ms step_avg:133.87ms | |
| step:1075/1395 train_time:142585ms step_avg:133.88ms | |
| step:1076/1395 train_time:142727ms step_avg:133.89ms | |
| step:1077/1395 train_time:142869ms step_avg:133.90ms | |
| step:1078/1395 train_time:143012ms step_avg:133.91ms | |
| step:1079/1395 train_time:143158ms step_avg:133.92ms | |
| step:1080/1395 train_time:143299ms step_avg:133.92ms | |
| step:1081/1395 train_time:143439ms step_avg:133.93ms | |
| step:1082/1395 train_time:143578ms step_avg:133.93ms | |
| step:1083/1395 train_time:143718ms step_avg:133.94ms | |
| step:1084/1395 train_time:143864ms step_avg:133.95ms | |
| step:1085/1395 train_time:144004ms step_avg:133.96ms | |
| step:1086/1395 train_time:144146ms step_avg:133.96ms | |
| step:1087/1395 train_time:144284ms step_avg:133.97ms | |
| step:1088/1395 train_time:144427ms step_avg:133.98ms | |
| step:1089/1395 train_time:144570ms step_avg:133.99ms | |
| step:1090/1395 train_time:144714ms step_avg:133.99ms | |
| step:1091/1395 train_time:144855ms step_avg:134.00ms | |
| step:1092/1395 train_time:144997ms step_avg:134.01ms | |
| step:1093/1395 train_time:145136ms step_avg:134.01ms | |
| step:1094/1395 train_time:145277ms step_avg:134.02ms | |
| step:1095/1395 train_time:145417ms step_avg:134.02ms | |
| step:1096/1395 train_time:145561ms step_avg:134.03ms | |
| step:1097/1395 train_time:145703ms step_avg:134.04ms | |
| step:1098/1395 train_time:145845ms step_avg:134.05ms | |
| step:1099/1395 train_time:145986ms step_avg:134.05ms | |
| step:1100/1395 train_time:146126ms step_avg:134.06ms | |
| step:1101/1395 train_time:146265ms step_avg:134.07ms | |
| step:1102/1395 train_time:146406ms step_avg:134.07ms | |
| step:1103/1395 train_time:146549ms step_avg:134.08ms | |
| step:1104/1395 train_time:146690ms step_avg:134.09ms | |
| step:1105/1395 train_time:146836ms step_avg:134.10ms | |
| step:1106/1395 train_time:146978ms step_avg:134.10ms | |
| step:1107/1395 train_time:147117ms step_avg:134.11ms | |
| step:1108/1395 train_time:147262ms step_avg:134.12ms | |
| step:1109/1395 train_time:147402ms step_avg:134.12ms | |
| step:1110/1395 train_time:147542ms step_avg:134.13ms | |
| step:1111/1395 train_time:147682ms step_avg:134.13ms | |
| step:1112/1395 train_time:147820ms step_avg:134.14ms | |
| step:1113/1395 train_time:147958ms step_avg:134.14ms | |
| step:1114/1395 train_time:148104ms step_avg:134.15ms | |
| step:1115/1395 train_time:148249ms step_avg:134.16ms | |
| step:1116/1395 train_time:148389ms step_avg:134.17ms | |
| step:1117/1395 train_time:148529ms step_avg:134.17ms | |
| step:1118/1395 train_time:148676ms step_avg:134.18ms | |
| step:1119/1395 train_time:148814ms step_avg:134.19ms | |
| step:1120/1395 train_time:148957ms step_avg:134.20ms | |
| step:1121/1395 train_time:149099ms step_avg:134.20ms | |
| step:1122/1395 train_time:149239ms step_avg:134.21ms | |
| step:1123/1395 train_time:149379ms step_avg:134.21ms | |
| step:1124/1395 train_time:149519ms step_avg:134.22ms | |
| step:1125/1395 train_time:149658ms step_avg:134.22ms | |
| step:1125/1395 val_loss:3.3609 train_time:149772ms step_avg:134.32ms | |
| step:1126/1395 train_time:149805ms step_avg:134.23ms | |
| step:1127/1395 train_time:149950ms step_avg:134.24ms | |
| step:1128/1395 train_time:150093ms step_avg:134.25ms | |
| step:1129/1395 train_time:150240ms step_avg:134.26ms | |
| step:1130/1395 train_time:150382ms step_avg:134.27ms | |
| step:1131/1395 train_time:150529ms step_avg:134.28ms | |
| step:1132/1395 train_time:150668ms step_avg:134.29ms | |
| step:1133/1395 train_time:150808ms step_avg:134.29ms | |
| step:1134/1395 train_time:150949ms step_avg:134.30ms | |
| step:1135/1395 train_time:151088ms step_avg:134.30ms | |
| step:1136/1395 train_time:151234ms step_avg:134.31ms | |
| step:1137/1395 train_time:151374ms step_avg:134.32ms | |
| step:1138/1395 train_time:151518ms step_avg:134.32ms | |
| step:1139/1395 train_time:151659ms step_avg:134.33ms | |
| step:1140/1395 train_time:151800ms step_avg:134.34ms | |
| step:1141/1395 train_time:151940ms step_avg:134.34ms | |
| step:1142/1395 train_time:152081ms step_avg:134.35ms | |
| step:1143/1395 train_time:152224ms step_avg:134.36ms | |
| step:1144/1395 train_time:152365ms step_avg:134.36ms | |
| step:1145/1395 train_time:152504ms step_avg:134.36ms | |
| step:1146/1395 train_time:152645ms step_avg:134.37ms | |
| step:1147/1395 train_time:152790ms step_avg:134.38ms | |
| step:1148/1395 train_time:152933ms step_avg:134.39ms | |
| step:1149/1395 train_time:153075ms step_avg:134.39ms | |
| step:1150/1395 train_time:153216ms step_avg:134.40ms | |
| step:1151/1395 train_time:153362ms step_avg:134.41ms | |
| step:1152/1395 train_time:153503ms step_avg:134.42ms | |
| step:1153/1395 train_time:153649ms step_avg:134.43ms | |
| step:1154/1395 train_time:153788ms step_avg:134.43ms | |
| step:1155/1395 train_time:153929ms step_avg:134.44ms | |
| step:1156/1395 train_time:154079ms step_avg:134.45ms | |
| step:1157/1395 train_time:154221ms step_avg:134.46ms | |
| step:1158/1395 train_time:154361ms step_avg:134.46ms | |
| step:1159/1395 train_time:154503ms step_avg:134.47ms | |
| step:1160/1395 train_time:154644ms step_avg:134.47ms | |
| step:1161/1395 train_time:154787ms step_avg:134.48ms | |
| step:1162/1395 train_time:154929ms step_avg:134.49ms | |
| step:1163/1395 train_time:155070ms step_avg:134.49ms | |
| step:1164/1395 train_time:155212ms step_avg:134.50ms | |
| step:1165/1395 train_time:155352ms step_avg:134.50ms | |
| step:1166/1395 train_time:155493ms step_avg:134.51ms | |
| step:1167/1395 train_time:155635ms step_avg:134.52ms | |
| step:1168/1395 train_time:155776ms step_avg:134.52ms | |
| step:1169/1395 train_time:155918ms step_avg:134.53ms | |
| step:1170/1395 train_time:156059ms step_avg:134.53ms | |
| step:1171/1395 train_time:156200ms step_avg:134.54ms | |
| step:1172/1395 train_time:156344ms step_avg:134.55ms | |
| step:1173/1395 train_time:156485ms step_avg:134.55ms | |
| step:1174/1395 train_time:156638ms step_avg:134.57ms | |
| step:1175/1395 train_time:156781ms step_avg:134.58ms | |
| step:1176/1395 train_time:156923ms step_avg:134.58ms | |
| step:1177/1395 train_time:157070ms step_avg:134.59ms | |
| step:1178/1395 train_time:157211ms step_avg:134.60ms | |
| step:1179/1395 train_time:157350ms step_avg:134.60ms | |
| step:1180/1395 train_time:157497ms step_avg:134.61ms | |
| step:1181/1395 train_time:157641ms step_avg:134.62ms | |
| step:1182/1395 train_time:157781ms step_avg:134.63ms | |
| step:1183/1395 train_time:157923ms step_avg:134.63ms | |
| step:1184/1395 train_time:158065ms step_avg:134.64ms | |
| step:1185/1395 train_time:158207ms step_avg:134.64ms | |
| step:1186/1395 train_time:158350ms step_avg:134.65ms | |
| step:1187/1395 train_time:158502ms step_avg:134.67ms | |
| step:1188/1395 train_time:158643ms step_avg:134.67ms | |
| step:1189/1395 train_time:158791ms step_avg:134.68ms | |
| step:1190/1395 train_time:158932ms step_avg:134.69ms | |
| step:1191/1395 train_time:159074ms step_avg:134.69ms | |
| step:1192/1395 train_time:159213ms step_avg:134.70ms | |
| step:1193/1395 train_time:159352ms step_avg:134.70ms | |
| step:1194/1395 train_time:159493ms step_avg:134.71ms | |
| step:1195/1395 train_time:159636ms step_avg:134.71ms | |
| step:1196/1395 train_time:159779ms step_avg:134.72ms | |
| step:1197/1395 train_time:159921ms step_avg:134.73ms | |
| step:1198/1395 train_time:160070ms step_avg:134.74ms | |
| step:1199/1395 train_time:160211ms step_avg:134.74ms | |
| step:1200/1395 train_time:160351ms step_avg:134.75ms | |
| step:1201/1395 train_time:160490ms step_avg:134.75ms | |
| step:1202/1395 train_time:160643ms step_avg:134.77ms | |
| step:1203/1395 train_time:160792ms step_avg:134.78ms | |
| step:1204/1395 train_time:160938ms step_avg:134.79ms | |
| step:1205/1395 train_time:161080ms step_avg:134.80ms | |
| step:1206/1395 train_time:161224ms step_avg:134.80ms | |
| step:1207/1395 train_time:161368ms step_avg:134.81ms | |
| step:1208/1395 train_time:161513ms step_avg:134.82ms | |
| step:1209/1395 train_time:161656ms step_avg:134.83ms | |
| step:1210/1395 train_time:161801ms step_avg:134.83ms | |
| step:1211/1395 train_time:161947ms step_avg:134.84ms | |
| step:1212/1395 train_time:162089ms step_avg:134.85ms | |
| step:1213/1395 train_time:162231ms step_avg:134.86ms | |
| step:1214/1395 train_time:162375ms step_avg:134.86ms | |
| step:1215/1395 train_time:162520ms step_avg:134.87ms | |
| step:1216/1395 train_time:162659ms step_avg:134.88ms | |
| step:1217/1395 train_time:162803ms step_avg:134.88ms | |
| step:1218/1395 train_time:162944ms step_avg:134.89ms | |
| step:1219/1395 train_time:163085ms step_avg:134.89ms | |
| step:1220/1395 train_time:163228ms step_avg:134.90ms | |
| step:1221/1395 train_time:163367ms step_avg:134.90ms | |
| step:1222/1395 train_time:163508ms step_avg:134.91ms | |
| step:1223/1395 train_time:163648ms step_avg:134.91ms | |
| step:1224/1395 train_time:163791ms step_avg:134.92ms | |
| step:1225/1395 train_time:163935ms step_avg:134.93ms | |
| step:1226/1395 train_time:164075ms step_avg:134.93ms | |
| step:1227/1395 train_time:164215ms step_avg:134.93ms | |
| step:1228/1395 train_time:164357ms step_avg:134.94ms | |
| step:1229/1395 train_time:164499ms step_avg:134.95ms | |
| step:1230/1395 train_time:164647ms step_avg:134.96ms | |
| step:1231/1395 train_time:164791ms step_avg:134.96ms | |
| step:1232/1395 train_time:164933ms step_avg:134.97ms | |
| step:1233/1395 train_time:165075ms step_avg:134.98ms | |
| step:1234/1395 train_time:165217ms step_avg:134.98ms | |
| step:1235/1395 train_time:165360ms step_avg:134.99ms | |
| step:1236/1395 train_time:165502ms step_avg:134.99ms | |
| step:1237/1395 train_time:165645ms step_avg:135.00ms | |
| step:1238/1395 train_time:165796ms step_avg:135.01ms | |
| step:1239/1395 train_time:165939ms step_avg:135.02ms | |
| step:1240/1395 train_time:166083ms step_avg:135.03ms | |
| step:1241/1395 train_time:166228ms step_avg:135.04ms | |
| step:1242/1395 train_time:166369ms step_avg:135.04ms | |
| step:1243/1395 train_time:166517ms step_avg:135.05ms | |
| step:1244/1395 train_time:166658ms step_avg:135.05ms | |
| step:1245/1395 train_time:166799ms step_avg:135.06ms | |
| step:1246/1395 train_time:166942ms step_avg:135.07ms | |
| step:1247/1395 train_time:167086ms step_avg:135.07ms | |
| step:1248/1395 train_time:167226ms step_avg:135.08ms | |
| step:1249/1395 train_time:167366ms step_avg:135.08ms | |
| step:1250/1395 train_time:167508ms step_avg:135.09ms | |
| step:1250/1395 val_loss:3.3151 train_time:167627ms step_avg:135.18ms | |
| step:1251/1395 train_time:167660ms step_avg:135.10ms | |
| step:1252/1395 train_time:167805ms step_avg:135.11ms | |
| step:1253/1395 train_time:167944ms step_avg:135.11ms | |
| step:1254/1395 train_time:168084ms step_avg:135.12ms | |
| step:1255/1395 train_time:168241ms step_avg:135.13ms | |
| step:1256/1395 train_time:168382ms step_avg:135.14ms | |
| step:1257/1395 train_time:168524ms step_avg:135.14ms | |
| step:1258/1395 train_time:168670ms step_avg:135.15ms | |
| step:1259/1395 train_time:168818ms step_avg:135.16ms | |
| step:1260/1395 train_time:168957ms step_avg:135.17ms | |
| step:1261/1395 train_time:169102ms step_avg:135.17ms | |
| step:1262/1395 train_time:169245ms step_avg:135.18ms | |
| step:1263/1395 train_time:169392ms step_avg:135.19ms | |
| step:1264/1395 train_time:169536ms step_avg:135.20ms | |
| step:1265/1395 train_time:169676ms step_avg:135.20ms | |
| step:1266/1395 train_time:169821ms step_avg:135.21ms | |
| step:1267/1395 train_time:169965ms step_avg:135.22ms | |
| step:1268/1395 train_time:170110ms step_avg:135.22ms | |
| step:1269/1395 train_time:170257ms step_avg:135.23ms | |
| step:1270/1395 train_time:170401ms step_avg:135.24ms | |
| step:1271/1395 train_time:170541ms step_avg:135.24ms | |
| step:1272/1395 train_time:170682ms step_avg:135.25ms | |
| step:1273/1395 train_time:170824ms step_avg:135.25ms | |
| step:1274/1395 train_time:170967ms step_avg:135.26ms | |
| step:1275/1395 train_time:171112ms step_avg:135.27ms | |
| step:1276/1395 train_time:171253ms step_avg:135.27ms | |
| step:1277/1395 train_time:171395ms step_avg:135.28ms | |
| step:1278/1395 train_time:171537ms step_avg:135.28ms | |
| step:1279/1395 train_time:171679ms step_avg:135.29ms | |
| step:1280/1395 train_time:171831ms step_avg:135.30ms | |
| step:1281/1395 train_time:171973ms step_avg:135.31ms | |
| step:1282/1395 train_time:172115ms step_avg:135.31ms | |
| step:1283/1395 train_time:172255ms step_avg:135.31ms | |
| step:1284/1395 train_time:172397ms step_avg:135.32ms | |
| step:1285/1395 train_time:172541ms step_avg:135.33ms | |
| step:1286/1395 train_time:172683ms step_avg:135.33ms | |
| step:1287/1395 train_time:172827ms step_avg:135.34ms | |
| step:1288/1395 train_time:172967ms step_avg:135.34ms | |
| step:1289/1395 train_time:173119ms step_avg:135.35ms | |
| step:1290/1395 train_time:173267ms step_avg:135.36ms | |
| step:1291/1395 train_time:173412ms step_avg:135.37ms | |
| step:1292/1395 train_time:173557ms step_avg:135.38ms | |
| step:1293/1395 train_time:173706ms step_avg:135.39ms | |
| step:1294/1395 train_time:173846ms step_avg:135.39ms | |
| step:1295/1395 train_time:173988ms step_avg:135.40ms | |
| step:1296/1395 train_time:174137ms step_avg:135.41ms | |
| step:1297/1395 train_time:174283ms step_avg:135.42ms | |
| step:1298/1395 train_time:174425ms step_avg:135.42ms | |
| step:1299/1395 train_time:174566ms step_avg:135.43ms | |
| step:1300/1395 train_time:174705ms step_avg:135.43ms | |
| step:1301/1395 train_time:174846ms step_avg:135.43ms | |
| step:1302/1395 train_time:174987ms step_avg:135.44ms | |
| step:1303/1395 train_time:175132ms step_avg:135.45ms | |
| step:1304/1395 train_time:175277ms step_avg:135.45ms | |
| step:1305/1395 train_time:175419ms step_avg:135.46ms | |
| step:1306/1395 train_time:175561ms step_avg:135.46ms | |
| step:1307/1395 train_time:175703ms step_avg:135.47ms | |
| step:1308/1395 train_time:175847ms step_avg:135.48ms | |
| step:1309/1395 train_time:175992ms step_avg:135.48ms | |
| step:1310/1395 train_time:176135ms step_avg:135.49ms | |
| step:1311/1395 train_time:176276ms step_avg:135.49ms | |
| step:1312/1395 train_time:176418ms step_avg:135.50ms | |
| step:1313/1395 train_time:176561ms step_avg:135.50ms | |
| step:1314/1395 train_time:176704ms step_avg:135.51ms | |
| step:1315/1395 train_time:176848ms step_avg:135.52ms | |
| step:1316/1395 train_time:176989ms step_avg:135.52ms | |
| step:1317/1395 train_time:177132ms step_avg:135.53ms | |
| step:1318/1395 train_time:177280ms step_avg:135.53ms | |
| step:1319/1395 train_time:177424ms step_avg:135.54ms | |
| step:1320/1395 train_time:177566ms step_avg:135.55ms | |
| step:1321/1395 train_time:177706ms step_avg:135.55ms | |
| step:1322/1395 train_time:177856ms step_avg:135.56ms | |
| step:1323/1395 train_time:178000ms step_avg:135.57ms | |
| step:1324/1395 train_time:178141ms step_avg:135.57ms | |
| step:1325/1395 train_time:178284ms step_avg:135.58ms | |
| step:1326/1395 train_time:178430ms step_avg:135.59ms | |
| step:1327/1395 train_time:178572ms step_avg:135.59ms | |
| step:1328/1395 train_time:178711ms step_avg:135.59ms | |
| step:1329/1395 train_time:178865ms step_avg:135.61ms | |
| step:1330/1395 train_time:179012ms step_avg:135.61ms | |
| step:1331/1395 train_time:179158ms step_avg:135.62ms | |
| step:1332/1395 train_time:179308ms step_avg:135.63ms | |
| step:1333/1395 train_time:179452ms step_avg:135.64ms | |
| step:1334/1395 train_time:179597ms step_avg:135.65ms | |
| step:1335/1395 train_time:179736ms step_avg:135.65ms | |
| step:1336/1395 train_time:179890ms step_avg:135.66ms | |
| step:1337/1395 train_time:180035ms step_avg:135.67ms | |
| step:1338/1395 train_time:180178ms step_avg:135.68ms | |
| step:1339/1395 train_time:180320ms step_avg:135.68ms | |
| step:1340/1395 train_time:180465ms step_avg:135.69ms | |
| step:1341/1395 train_time:180605ms step_avg:135.69ms | |
| step:1342/1395 train_time:180746ms step_avg:135.70ms | |
| step:1343/1395 train_time:180889ms step_avg:135.70ms | |
| step:1344/1395 train_time:181033ms step_avg:135.71ms | |
| step:1345/1395 train_time:181176ms step_avg:135.71ms | |
| step:1346/1395 train_time:181319ms step_avg:135.72ms | |
| step:1347/1395 train_time:181465ms step_avg:135.73ms | |
| step:1348/1395 train_time:181607ms step_avg:135.73ms | |
| step:1349/1395 train_time:181748ms step_avg:135.73ms | |
| step:1350/1395 train_time:181889ms step_avg:135.74ms | |
| step:1351/1395 train_time:182032ms step_avg:135.74ms | |
| step:1352/1395 train_time:182183ms step_avg:135.76ms | |
| step:1353/1395 train_time:182329ms step_avg:135.76ms | |
| step:1354/1395 train_time:182472ms step_avg:135.77ms | |
| step:1355/1395 train_time:182614ms step_avg:135.77ms | |
| step:1356/1395 train_time:182756ms step_avg:135.78ms | |
| step:1357/1395 train_time:182900ms step_avg:135.78ms | |
| step:1358/1395 train_time:183044ms step_avg:135.79ms | |
| step:1359/1395 train_time:183185ms step_avg:135.79ms | |
| step:1360/1395 train_time:183329ms step_avg:135.80ms | |
| step:1361/1395 train_time:183475ms step_avg:135.81ms | |
| step:1362/1395 train_time:183622ms step_avg:135.81ms | |
| step:1363/1395 train_time:183775ms step_avg:135.83ms | |
| step:1364/1395 train_time:183920ms step_avg:135.83ms | |
| step:1365/1395 train_time:184059ms step_avg:135.84ms | |
| step:1366/1395 train_time:184205ms step_avg:135.84ms | |
| step:1367/1395 train_time:184347ms step_avg:135.85ms | |
| step:1368/1395 train_time:184491ms step_avg:135.85ms | |
| step:1369/1395 train_time:184642ms step_avg:135.87ms | |
| step:1370/1395 train_time:184791ms step_avg:135.88ms | |
| step:1371/1395 train_time:184936ms step_avg:135.88ms | |
| step:1372/1395 train_time:185083ms step_avg:135.89ms | |
| step:1373/1395 train_time:185225ms step_avg:135.89ms | |
| step:1374/1395 train_time:185373ms step_avg:135.90ms | |
| step:1375/1395 train_time:185518ms step_avg:135.91ms | |
| step:1375/1395 val_loss:3.2814 train_time:185631ms step_avg:135.99ms | |
| step:1376/1395 train_time:185665ms step_avg:135.92ms | |
| step:1377/1395 train_time:185816ms step_avg:135.93ms | |
| step:1378/1395 train_time:185958ms step_avg:135.93ms | |
| step:1379/1395 train_time:186101ms step_avg:135.94ms | |
| step:1380/1395 train_time:186245ms step_avg:135.95ms | |
| step:1381/1395 train_time:186394ms step_avg:135.95ms | |
| step:1382/1395 train_time:186537ms step_avg:135.96ms | |
| step:1383/1395 train_time:186678ms step_avg:135.96ms | |
| step:1384/1395 train_time:186828ms step_avg:135.97ms | |
| step:1385/1395 train_time:186968ms step_avg:135.98ms | |
| step:1386/1395 train_time:187113ms step_avg:135.98ms | |
| step:1387/1395 train_time:187256ms step_avg:135.99ms | |
| step:1388/1395 train_time:187397ms step_avg:135.99ms | |
| step:1389/1395 train_time:187540ms step_avg:136.00ms | |
| step:1390/1395 train_time:187683ms step_avg:136.00ms | |
| step:1391/1395 train_time:187825ms step_avg:136.01ms | |
| step:1392/1395 train_time:187969ms step_avg:136.01ms | |
| step:1393/1395 train_time:188114ms step_avg:136.02ms | |
| step:1394/1395 train_time:188257ms step_avg:136.02ms | |
| step:1395/1395 train_time:188397ms step_avg:136.03ms | |
| step:1395/1395 val_loss:3.2770 train_time:188512ms step_avg:136.11ms | |
| peak memory allocated: 37620 MiB reserved: 39014 MiB | |
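| # ----------------------------------------------------------------------------- | |
| A minimal post-hoc sketch (an assumed helper, not part of the run script above): parse the timing lines in this log and sanity-check the reported step_avg. In this log the reported step_avg matches train_time / (step - 10), which suggests the first 10 steps are excluded from the timing average; that offset is inferred from the numbers here, not stated in the log itself. | |
| import re | |
| # matches lines like "step:388/1395 train_time:48344ms step_avg:127.89ms" | |
| timing_re = re.compile(r"step:(\d+)/\d+ train_time:(\d+)ms step_avg:([\d.]+)ms") | |
| def check_step_avg(log_text: str, warmup_steps: int = 10) -> None: | |
|     # compare the reported step_avg against train_time / (step - warmup_steps) | |
|     for m in timing_re.finditer(log_text): | |
|         step = int(m.group(1)) | |
|         train_ms = int(m.group(2)) | |
|         reported = float(m.group(3)) | |
|         if step <= warmup_steps: | |
|             continue  # no average is defined for the untimed warmup steps | |
|         computed = train_ms / (step - warmup_steps) | |
|         # reported values are rounded to 2 decimals, so allow a small tolerance | |
|         assert abs(computed - reported) < 0.01, (step, computed, reported) | |
| # example usage: check_step_avg(open("train_log.txt").read()) | |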