import os
import sys
# Capture this file's own source text immediately so it can be logged verbatim below.
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import copy
import glob
from dataclasses import dataclass
from functools import lru_cache, partial # Added partial for hook registration
from pathlib import Path
# Must be set before torch initializes the CUDA caching allocator.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
torch.empty(1, device="cuda", requires_grad=True).backward() # prevents a bug on some systems
from torch import Tensor, nn, autocast
import torch.nn.functional as F
import torch.distributed as dist
# use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention
#torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min
#import wandb
# -----------------------------------------------------------------------------
# Custom operators: FP8 matmul by @YouJiacheng
@torch.library.custom_op("nanogpt::mm", mutates_args=())
def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
    """FP8 forward matmul: returns (x @ w.T in bf16, x cast to fp8, w cast to fp8).

    The fp8 copies are returned so the backward pass can reuse them without
    re-casting. x_s/w_s scale the inputs down before the fp8 cast (to avoid
    overflow) and are re-applied inside _scaled_mm; grad_s is used only in backward.
    """
    @torch.compile
    def impl(x: Tensor, w: Tensor):
        assert x.is_contiguous() and w.is_contiguous()
        x_f8 = x.div(x_s).to(torch.float8_e4m3fn)
        w_f8 = w.div(w_s).to(torch.float8_e4m3fn)
        out = torch._scaled_mm(
            x_f8,
            w_f8.T,
            out_dtype=torch.bfloat16,
            scale_a=x.new_tensor(x_s, dtype=torch.float32),
            scale_b=x.new_tensor(w_s, dtype=torch.float32),
            use_fast_accum=True,
        )
        return out, x_f8, w_f8
    return impl(x, w)
@mm_op.register_fake
def _(x: Tensor, w: Tensor, *_):
    """Fake (meta) implementation for nanogpt::mm: validates shapes/layout and
    returns tensors with the same shapes/dtypes the real op produces."""
    assert x.ndim == w.ndim == 2
    assert x.shape[1] == w.shape[1]
    assert x.device == w.device
    assert x.is_contiguous() and w.is_contiguous()
    return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn)
@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]:
    """Backward for nanogpt::mm: computes (grad_x in bf16, grad_w in fp32) from
    the fp8 activations/weights saved by the forward pass."""
    @torch.compile
    def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor):
        assert grad.is_contiguous()
        x_inv_s = grad.new_tensor(x_s, dtype=torch.float32)
        w_inv_s = grad.new_tensor(w_s, dtype=torch.float32)
        grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32)
        # e5m2 has more exponent range than e4m3, suiting gradient magnitudes
        grad_f8 = grad.div(grad_s).to(torch.float8_e5m2)
        # the .T.contiguous().T dance produces the column-major layout _scaled_mm requires
        grad_x = torch._scaled_mm(
            grad_f8,
            w_f8.T.contiguous().T,
            out_dtype=torch.bfloat16,
            scale_a=grad_inv_s,
            scale_b=w_inv_s,
            use_fast_accum=False,
        )
        # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768)
        grad_w = torch._scaled_mm(
            x_f8.T.contiguous(),
            grad_f8.T.contiguous().T,
            out_dtype=torch.float32,
            scale_a=x_inv_s,
            scale_b=grad_inv_s,
            use_fast_accum=False,
        ).T
        return grad_x, grad_w
    return impl(g, x_f8, w_f8)
@mm_backward_op.register_fake
def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_):
    """Fake (meta) implementation: mirror the shapes/dtypes of the real backward."""
    return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32)
def backward(ctx, grad_out: Tensor, *_):
    """Autograd backward for nanogpt::mm: delegate to the custom fp8 backward op.
    Gradients w.r.t. the auxiliary x_f8/w_f8 outputs are ignored (*_)."""
    x_f8, w_f8 = ctx.saved_tensors
    x_s, w_s, grad_s = ctx.scales
    grad_x, grad_w = torch.ops.nanogpt.mm_backward(
        grad_out, x_f8, w_f8, x_s, w_s, grad_s
    )
    # None entries correspond to the three float scale arguments (no grad needed)
    return grad_x, grad_w, None, None, None
def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
    """Stash the fp8 tensors and scales produced by the forward for the backward pass."""
    *_, x_s, w_s, grad_s = inputs
    _, x_f8, w_f8 = output
    ctx.save_for_backward(x_f8, w_f8)
    ctx.scales = x_s, w_s, grad_s
    ctx.set_materialize_grads(False)
mm_op.register_autograd(backward, setup_context=setup_context)
| # ----------------------------------------------------------------------------- | |
| # Muon optimizer | |
@torch.compile(mode="reduce-overhead", fullgraph=True, dynamic=False)
def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor:
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.
    """
    assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
    a, b, c = (3.4445, -4.7750, 2.0315)
    # run the whole iteration in bf16; stable enough per the docstring above
    X = G.bfloat16()
    # work on the wide orientation so X @ X.mT is the smaller Gram matrix
    if G.size(-2) > G.size(-1):
        X = X.mT
    # Ensure spectral norm is at most 1
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
    # Perform the NS iterations
    for _ in range(steps):
        A = X @ X.mT
        B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose from above
    if G.size(-2) > G.size(-1):
        X = X.mT
    return X.type_as(G)
class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz
    https://kellerjordan.github.io/posts/muon/
    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.
    Warning: This optimizer should not be used for the embedding layer, the final fully connected layer,
    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
    """
    def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, rank=0, world_size=1):
        # rank/world_size shard the per-parameter update work across data-parallel ranks
        self.rank = rank
        self.world_size = world_size
        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum)
        params = list(params)
        sizes = {p.shape for p in params}
        # create one buffer per unique parameter-size
        param_groups = []
        for size in sizes:
            group_params = [p for p in params if p.shape == size]
            param_groups.append(dict(params=group_params,))
        super().__init__(param_groups, defaults)
    @torch.no_grad()
    def step(self):
        """One Muon step. Parameter base_i + rank is updated by this rank; gradients
        are averaged via async reduce_scatter and updated parameters rebroadcast
        via async all_gather."""
        futures: list[torch.Future] = []
        reduce_scatter_futures: list[torch.Future] = []
        # Phase 1: launch async reduce-scatters so each rank receives the averaged
        # gradient of the parameters it owns.
        for group in self.param_groups:
            params: list[Tensor] = group["params"]
            # dummy output buffer, used when this rank owns no parameter in a slice
            grad = torch.empty_like(params[-1])
            # pad with zero grads so every world_size-wide slice below is valid
            grad_pad = [param.grad for param in params] + [torch.zeros_like(params[-1])] * self.world_size
            for base_i in range(0, len(params), self.world_size):
                if base_i + self.rank < len(params):
                    grad = params[base_i + self.rank].grad
                # This gives strange dynamo warnings
                reduce_scatter_futures.append(dist.reduce_scatter(grad, grad_pad[base_i:base_i + self.world_size], op=dist.ReduceOp.AVG, async_op=True).get_future())
        # Phase 2: as each reduce-scatter finishes, apply momentum + Newton-Schulz
        # orthogonalization to the owned parameter, then all-gather results back.
        idx = 0
        for group in self.param_groups:
            params: list[Tensor] = group["params"]
            # padding mirrors grad_pad above so all_gather slices are always valid
            params_pad = params + [torch.empty_like(params[-1])] * self.world_size
            momentum = group["momentum"]
            for base_i in range(0, len(params), self.world_size):
                reduce_scatter_futures[idx].wait()
                if base_i + self.rank < len(params):
                    p = params[base_i + self.rank]
                    grad = p.grad
                    # lr scaled by sqrt of aspect ratio: tall matrices get larger steps
                    eff_lr = group["lr"] * max(1, p.size(-2) / p.size(-1)) ** 0.5 * getattr(p, "lr_mul", 1.0)
                    eff_weight_decay = group["lr"] * group["weight_decay"] * getattr(p, "wd_mul", 1.0)
                    state = self.state[p]
                    if len(state) == 0:
                        state["momentum_buffer"] = torch.zeros_like(grad)
                    momentum_buffer = state["momentum_buffer"]
                    p.mul_(1 - eff_weight_decay) # decoupled weight decay
                    momentum_buffer.lerp_(grad, 1 - momentum)
                    # blend grad with the momentum buffer before orthogonalizing
                    grad = grad.lerp_(momentum_buffer, momentum)
                    v = zeropower_via_newtonschulz5(grad, 5)
                    p.add_(other=v, alpha=-eff_lr)
                idx += 1
                futures.append(dist.all_gather(params_pad[base_i:base_i + self.world_size], params_pad[base_i + self.rank], async_op=True).get_future())
        # TODO: Check if commenting it is dangerous
        torch.futures.collect_all(futures).wait()
class DistAdam(torch.optim.Optimizer):
    """Distributed Adam with ZeRO-1-style sharding.

    Gradients are averaged across ranks with reduce_scatter_tensor, so each rank
    keeps only a 1/world_size slice of the gradient and of the Adam state
    (exp_avg / exp_avg_sq). Each rank applies the Adam update to its slice of
    every parameter, and the updated slices are rebroadcast with
    all_gather_into_tensor. Assumes p.shape[0] is divisible by world_size.
    """
    def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01, rank: int = 0, world_size: int = 1):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        params = list(params)
        sizes = {p.shape for p in params}
        self.rank = rank
        self.world_size = world_size
        # create one param group per unique parameter shape
        param_groups = []
        for size in sizes:
            group_params = [p for p in params if p.shape == size]
            param_groups.append(dict(params=group_params))
        super().__init__(param_groups, defaults)
    @torch.no_grad()
    def step(self):
        """One sharded Adam step (see class docstring for the communication scheme)."""
        futures: list[torch.Future] = []
        reduce_scatter_futures: list[torch.Future] = []
        grad_slices = []
        # Phase 1: kick off async reduce-scatters of every parameter's gradient.
        for group in self.param_groups:
            params: list[Tensor] = group["params"]
            for p in params:
                grad = p.grad
                rank_size = grad.shape[0] // self.world_size
                grad_slice = torch.empty_like(grad[:rank_size])
                reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future())
                grad_slices.append(grad_slice)
        # Phase 2: as each reduce-scatter completes, update this rank's slice and
        # launch an async all-gather of the updated parameter.
        idx = 0
        for group in self.param_groups:
            beta1, beta2 = group['betas']
            eps = group['eps']
            wd = group['weight_decay']
            params = group['params']
            for p in params:
                reduce_scatter_futures[idx].wait()
                rank_size = p.shape[0] // self.world_size
                # bugfix: use self.rank here — the original read a module-level
                # global `rank`, which only worked because the training script
                # happened to define one with the same value
                p_slice = p[self.rank * rank_size:(self.rank + 1) * rank_size]
                lr = group['lr'] * getattr(p, "lr_mul", 1.0)
                state = self.state[p]
                g_slice = grad_slices[idx]
                # lazy state init on first step
                if not state:
                    state['step'] = torch.tensor(0, dtype=torch.int64, device=p.device)
                    state['exp_avg'] = torch.zeros_like(p_slice)
                    state['exp_avg_sq'] = torch.zeros_like(p_slice)
                exp_avg = state['exp_avg']
                exp_avg_sq = state['exp_avg_sq']
                state['step'] += 1
                t = state['step']
                # decoupled (AdamW-style) weight decay, applied before the step
                if wd != 0:
                    eff_weight_decay = lr * wd * getattr(p, "wd_mul", 1.0)
                    p_slice.mul_(1 - eff_weight_decay)
                # update running first/second moments
                exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2)
                # bias corrections
                bias1 = 1 - beta1 ** t
                bias2 = 1 - beta2 ** t
                # compute and apply the update on this rank's slice
                denom = exp_avg_sq.sqrt().add_(eps)
                step_size = lr * (torch.sqrt(bias2) / bias1)
                update = exp_avg.div(denom).mul_(step_size)
                p_slice.add_(other=update, alpha=-1.0)
                idx += 1
                futures.append(dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future())
        # wait for all parameter all-gathers so weights are consistent on return
        torch.futures.collect_all(futures).wait()
| # ----------------------------------------------------------------------------- | |
| # PyTorch nn.Module definitions for the model | |
| def norm(x: Tensor): | |
| return F.rms_norm(x, (x.size(-1),)) | |
class CastedLinear(nn.Linear):
    """Bias-free linear layer that can optionally run its training-time forward
    matmul through the custom FP8 op (nanogpt::mm) with fixed scales."""
    def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0):
        super().__init__(in_features, out_features, bias=False)
        self.use_fp8 = use_fp8
        self.x_s = x_s
        self.w_s = w_s
        self.grad_s = grad_s
    def reset_parameters(self) -> None:
        # 0.5 is a bit better than the default 1/sqrt(3)
        std = 0.5 * (self.in_features ** -0.5)
        bound = (3 ** 0.5) * std
        with torch.no_grad():
            self.weight.uniform_(-bound, bound)
    def forward(self, x: Tensor):
        # eval mode (or fp8 disabled): plain bf16/fp32 matmul
        if not (self.use_fp8 and self.training):
            return F.linear(x, self.weight)
        # fp8 path: flatten leading dims, run the custom op, restore the shape
        flat = x.flatten(0, -2)
        out: Tensor = torch.ops.nanogpt.mm(flat, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0]
        return out.reshape(*x.shape[:-1], -1)
| class Rotary(nn.Module): | |
| def __init__(self, dim: int, max_seq_len: int): | |
| super().__init__() | |
| # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) | |
| angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=dim//4, dtype=torch.float32) | |
| angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(dim//4)]) | |
| t = torch.arange(max_seq_len, dtype=torch.float32) | |
| theta = torch.einsum("i,j -> ij", t, angular_freq) | |
| self.cos = nn.Buffer(theta.cos(), persistent=False) | |
| self.sin = nn.Buffer(theta.sin(), persistent=False) | |
| def forward(self, x_BTHD: Tensor): | |
| assert self.cos.size(0) >= x_BTHD.size(-3) | |
| cos, sin = self.cos[None, :x_BTHD.size(-3), None, :], self.sin[None, :x_BTHD.size(-3), None, :] | |
| x1, x2 = x_BTHD.to(dtype=torch.float32).chunk(2, dim=-1) | |
| y1 = x1 * cos + x2 * sin | |
| y2 = x1 * (-sin) + x2 * cos | |
| return torch.cat((y1, y2), 3).type_as(x_BTHD) | |
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention via FlexAttention, with merged QKV
    weights, rotary embeddings, QK norm, and a value-embedding residual mix."""
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, head_dim=128):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        hdim = num_heads * head_dim
        std = 0.5 * (dim ** -0.5)
        bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng
        # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng
        # https://x.com/hi_tysam/status/1879699187107033311
        self.qkv_w = nn.Parameter(torch.empty(3, hdim, dim).uniform_(-bound, bound))
        self.rotary = Rotary(head_dim, max_seq_len)
        self.c_proj = CastedLinear(hdim, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977
        # scale the attention logits by given constant, instead of the default head_dim**-0.5, by @leloykun
        # inspired by learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283
        self.attn_scale = 0.12
    def forward(self, x: Tensor, ve: Tensor | None, lambdas: Tensor, block_mask: BlockMask):
        """x: (1, T, dim); ve: optional token value embedding mixed into V;
        lambdas: two mixing scalars for the V / value-embedding blend."""
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        # one fused matmul yields (B, T, 3*num_heads, head_dim); split into Q, K, V on the head axis
        q, k, v = F.linear(x, self.qkv_w.flatten(end_dim=1)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2)
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        if ve is not None:
            v = lambdas[0] * v + lambdas[1] * ve.view_as(v) # @KoszarskyB & @Grad62304977
        else: # skip mid-layers token value embeddings by @YouJiacheng
            v = lambdas[0] * v
        # flex_attention expects (B, H, T, D); transpose in and out
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, scale=self.attn_scale).transpose(1, 2)
        y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y
class MLP(nn.Module):
    """Feed-forward block: Linear -> ReLU^2 -> Linear with 4x hidden expansion.
    The output projection starts at zero so the block is initially an identity residual."""
    def __init__(self, dim: int):
        super().__init__()
        hidden = 4 * dim
        self.c_fc = CastedLinear(dim, hidden)
        self.c_proj = CastedLinear(hidden, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977
    def forward(self, x: Tensor):
        # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        activated = F.relu(self.c_fc(x)).square()
        return self.c_proj(activated)
class Block(nn.Module):
    """One transformer block: optional attention sublayer + MLP sublayer, each on
    a residual path, with a learned mix of the stream and the embedding x0."""
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, layer_idx: int):
        super().__init__()
        # skip attention of blocks.7 (the 8th layer) by @YouJiacheng
        self.attn = CausalSelfAttention(dim, num_heads, max_seq_len) if layer_idx != 7 else None
        self.mlp = MLP(dim)
    def forward(self, x: Tensor, ve: Tensor | None, x0: Tensor, lambdas: Tensor, sa_lambdas: Tensor, block_mask: BlockMask):
        # re-inject the initial (normalized) embedding stream x0, weighted by learned lambdas
        x = lambdas[0] * x + lambdas[1] * x0
        if self.attn is not None:
            x = x + self.attn(norm(x), ve, sa_lambdas, block_mask)
        x = x + self.mlp(norm(x))
        return x
| # ----------------------------------------------------------------------------- | |
| # The main model | |
| def next_multiple_of_n(v: float | int, *, n: int): | |
| return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) | |
class GPT(nn.Module):
    """GPT with token value-embedding residuals, a U-net block structure with
    learnable skip weights, document-aware FlexAttention block masks, and an
    FP8 lm_head. Designed for batch size 1 (one long packed sequence)."""
    def __init__(self, vocab_size: int, num_layers: int, num_heads: int, model_dim: int, max_seq_len: int):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, model_dim)
        for param in self.embed.parameters():
            param.lr_mul = 75.
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897
        # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78
        self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)])
        # fix: the original nested loop iterated self.value_embeds.parameters()
        # once per embedding table, setting each lr_mul 3 times; one pass suffices
        for param in self.value_embeds.parameters():
            param.lr_mul = 75.
        self.blocks = nn.ModuleList([Block(model_dim, num_heads, max_seq_len, i) for i in range(num_layers)])
        # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency.
        # suggested to me by @Grad62304977. this originates from Karpathy's experiments.
        self.lm_head = CastedLinear(model_dim, next_multiple_of_n(vocab_size, n=128), use_fp8=True, x_s=(model_dim**0.5)/448, w_s=24/448, grad_s=1/448)
        self.lm_head.weight.lr_mul = 27.5
        self.lm_head.weight.detach().zero_() # @Grad62304977
        # Learnable scalars packed into one flat parameter:
        #   [0, num_layers)              U-net skip weights (only the first num_layers//2 are used)
        #   [num_layers, 3*num_layers)   per-block residual lambdas, in pairs (x, x0)
        #   [3*num_layers, 5*num_layers) per-block self-attention lambdas, in pairs (v, ve)
        assert num_layers % 2 == 0
        # NOTE(review): reads the module-level global `world_size`; the padding makes
        # the scalar count divisible by world_size so DistAdam can shard it evenly
        pad = (-num_layers * 5) % world_size
        self.scalars = nn.Parameter(torch.cat([
            torch.ones(num_layers), # skip_weights
            *[torch.tensor([1.0, 0.0]) for _ in range(num_layers)], # block lambdas
            *[torch.tensor([0.5, 0.5]) for _ in range(num_layers)], # SA lambdas
            torch.ones(pad),
        ]))
        self.scalars.lr_mul = 5.0
    def create_blockmasks(self, input_seq: Tensor, sliding_window_num_blocks: Tensor):
        """Build (long, short) sliding-window BlockMasks; documents are delimited
        by token 50256 and attention never crosses a document boundary."""
        BLOCK_SIZE = 128
        docs = (input_seq == 50256).cumsum(0)
        def document_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask
        def dense_to_ordered(dense_blockmask: Tensor):
            num_blocks = dense_blockmask.sum(dim=-1, dtype=torch.int32)
            indices = dense_blockmask.argsort(dim=-1, descending=False, stable=True).flip(-1).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()
        # manual block mask creation by @YouJiacheng
        assert len(input_seq) % BLOCK_SIZE == 0
        NUM_BLOCKS = len(input_seq) // BLOCK_SIZE
        block_idx = torch.arange(NUM_BLOCKS, dtype=torch.int32, device="cuda")
        causal_blockmask_any = block_idx[:, None] >= block_idx
        causal_blockmask_all = block_idx[:, None] > block_idx
        # first/last document id within each 128-token block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()
        document_blockmask_any = (docs_low[:, None] <= docs_high) & (docs_high[:, None] >= docs_low)
        document_blockmask_all = (docs_low[:, None] == docs_high) & (docs_high[:, None] == docs_low)
        blockmask_any = causal_blockmask_any & document_blockmask_any
        blockmask_all = causal_blockmask_all & document_blockmask_all
        # "partial" blocks need the mask_mod evaluated per-token; "full" blocks don't
        partial_kv_num_blocks, partial_kv_indices = dense_to_ordered(blockmask_any & ~blockmask_all)
        full_kv_num_blocks, full_kv_indices = dense_to_ordered(blockmask_all)
        def build_bm(window_size_blocks: Tensor) -> BlockMask:
            return BlockMask.from_kv_blocks(
                torch.clamp_max(partial_kv_num_blocks, torch.clamp_min(window_size_blocks - full_kv_num_blocks, 1)),
                partial_kv_indices,
                torch.clamp_max(full_kv_num_blocks, window_size_blocks - 1),
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )
        # Long-short SWA block masks by @leloykun & @YouJiacheng, adapated from suggestion by @Grad62304977, following Gemma 2 paper
        return build_bm(sliding_window_num_blocks), build_bm(sliding_window_num_blocks // 2)
    def forward(self, input_seq: Tensor, target_seq: Tensor, sliding_window_num_blocks: Tensor):
        """input_seq/target_seq: flat 1-D token id tensors. Returns summed (train)
        or mean (eval) cross-entropy loss."""
        assert input_seq.ndim == 1
        ve = [value_embed(input_seq) for value_embed in self.value_embeds]
        # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure
        ve = [ve[0], ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]]
        assert len(ve) == len(self.blocks)
        long_bm, short_bm = self.create_blockmasks(input_seq, sliding_window_num_blocks)
        block_masks = [long_bm, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, long_bm, short_bm, short_bm, short_bm, long_bm]
        assert len(block_masks) == len(self.blocks)
        x = x0 = norm(self.embed(input_seq)[None]) # use of norm here by @Grad62304977
        # U-net design by @brendanh0gan
        skip_connections = []
        skip_weights = self.scalars[:(len(self.blocks) // 2)]
        lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2)
        sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2)
        n = len(self.blocks) // 2
        for i in range(len(self.blocks)):
            # second half: add back the matching first-half activation, scaled
            if i >= n:
                x = x + skip_weights[i - n] * skip_connections.pop()
            x = self.blocks[i](x, ve[i], x0, lambdas[i], sa_lambdas[i], block_masks[i])
            if i < n:
                skip_connections.append(x)
        x = norm(x)
        logits = self.lm_head(x).float()
        # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1)
        logits = 30 * torch.sigmoid(logits / (7.5 * x.size(-1)**0.5))
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_seq, reduction='sum' if self.training else 'mean')
        return loss
| # ----------------------------------------------------------------------------- | |
| # Our own simple Distributed Data Loader | |
def _load_data_shard(file: Path):
    """Load one .bin token shard into a pinned uint16 CPU tensor.

    File layout: 256 int32 header (magic, version, token count), then uint16 tokens.
    Pinned memory enables fast async H2D copies later (requires CUDA).
    """
    header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    num_tokens = int(header[2]) # number of tokens (claimed)
    with file.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng
        f.seek(256 * 4) # skip past the int32 header
        nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header"
    return tokens
def distributed_data_generator(filename_pattern: str, batch_size: int, rank : int, world_size : int):
    """Yield (inputs, targets) GPU tensor pairs forever (single pass over files).

    Each rank reads its own disjoint local_batch_size slice of every global
    batch; targets are inputs shifted by one token.
    """
    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
    assert batch_size % world_size == 0
    local_batch_size = batch_size // world_size
    file_iter = iter(files) # use itertools.cycle(files) instead if you want to do multi-epoch training
    tokens, pos = _load_data_shard(next(file_iter)), 0
    while True:
        # advance to the next shard when the current one can't serve a full batch (+1 for the shifted target)
        if pos + batch_size + 1 >= len(tokens):
            tokens, pos = _load_data_shard(next(file_iter)), 0
        buf = tokens[pos + rank * local_batch_size:][:local_batch_size + 1]
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # no sync on host side;
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # H2D in another stream isn't helpful.
        pos += batch_size
        yield inputs, targets
| # ----------------------------------------------------------------------------- | |
| # int main | |
@dataclass
class Hyperparameters:
    """All run hyperparameters.

    NOTE(review): these attributes carry no type annotations, so @dataclass
    registers no fields; they behave as plain class-level constants.
    """
    # data
    train_files = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on
    val_files = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on
    val_tokens = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    # optimization
    num_iterations = 1770 # number of iterations to run
    cooldown_frac = 0.4 # fraction of training spent cooling down the learning rate
    # evaluation and logging
    val_loss_every = 125 # every how many steps to evaluate val loss? 0 for only at the end
    # implementation
    seq_len = 48*1024 # FlexAttention sequence length
    val_seq_len = 4*64*1024 # FlexAttention sequence length for validation
    save_checkpoint = False
args = Hyperparameters()
# torchrun sets these env variables
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
assert torch.cuda.is_available()
# bind this process to its local GPU before initializing NCCL
device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
torch.cuda.set_device(device)
dist.init_process_group(backend="nccl", device_id=device)
dist.barrier()
master_process = (rank == 0) # this process will do logging, checkpointing etc.
#if master_process:
#    wandb.init(project="modded-nanogpt-tiny", name=f"run-{os.path.basename(__file__)}", save_code=True)
# begin logging: only the master rank writes a log file, named by a fresh run id
logfile = None
if master_process:
    run_id = uuid.uuid4()
    os.makedirs("logs", exist_ok=True)
    logfile = f"logs/{run_id}.txt"
    print(logfile)
def print0(s, console=True):
    """Append s to the master rank's log file; echo to stdout when console is True.
    No-op on non-master ranks."""
    if not master_process:
        return
    if console:
        print(s)
    with open(logfile, "a") as f:
        print(s, file=f)
# begin by printing this file (the Python code)
print0(code)
print0("="*100)
# log information about the hardware/software environment this is running on
print0(f"Running Python {sys.version}")
print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
def nvidia_smi():
    """Return the stdout of `nvidia-smi` as a string (stderr is captured and discarded)."""
    import subprocess # avoid top level import
    result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
    return result.stdout
if master_process:
    print0(nvidia_smi())
print0("="*100)
# instantiate the model; max_seq_len must cover both the train and val sequence lengths
model: nn.Module = GPT(vocab_size=next_multiple_of_n(50257, n=128), num_layers=12, num_heads=6, model_dim=768, max_seq_len=max(args.seq_len, args.val_seq_len)).cuda()
for m in model.modules():
    if isinstance(m, nn.Embedding):
        m.bfloat16() # keep embedding tables in bf16 to save memory/bandwidth
for param in model.parameters():
    dist.broadcast(param.detach(), 0) # sync initial weights from rank 0 to all ranks
# collect the parameters to optimize: 2D+ hidden matrices go to Muon, everything else to DistAdam
hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n]
embed_params = [p for n, p in model.named_parameters() if "embed" in n]
scalar_params = [p for p in model.parameters() if p.ndim < 2]
head_params = [model.lm_head.weight]
# init the optimizer(s)
# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
optimizer1 = DistAdam(scalar_params + head_params + embed_params, lr=0.008, betas=(0.8, 0.95), eps=1e-10, weight_decay=0.0, rank=rank, world_size=world_size)
optimizer2 = Muon(hidden_matrix_params, lr=0.05, momentum=0.95, rank=rank, world_size=world_size, weight_decay=0.0)
optimizers = [optimizer1, optimizer2]
for opt in optimizers:
    for group in opt.param_groups:
        group["initial_lr"] = group["lr"] # remember the base lr so the schedule can scale it
# log each parameter's shape, dtype and per-parameter lr/wd multipliers
for n, p in model.named_parameters():
    wd_mul = getattr(p, "wd_mul", 1.0)
    lr_mul = getattr(p, "lr_mul", 1.0)
    print0(f"{n}: {p.shape} {p.dtype} {wd_mul} {lr_mul}")
# Count parameters
total_params = sum(p.numel() for p in model.parameters())
embedding_params = sum(p.numel() for n, p in model.named_parameters() if "embed" in n)
non_embedding_params = total_params - embedding_params
print0(f"")
print0(f"Model parameters:")
print0(f"  Total parameters: {total_params:,}")
print0(f"  Embedding parameters: {embedding_params:,}")
print0(f"  Non-embedding parameters: {non_embedding_params:,}")
def get_lr(step: int):
    """Learning-rate multiplier schedule: flat at 1.0, then a linear cooldown to
    0.1 over the final cooldown_frac of training."""
    progress = step / args.num_iterations
    assert 0 <= progress <= 1
    w = min((1 - progress) / args.cooldown_frac, 1.0) # 1 during the stable phase -> 0 at the end
    return w * 1.0 + (1 - w) * 0.1
@lru_cache(1)
def get_window_size_blocks_helper(window_size: int):
    # cache (size 1: only the most recent window) the GPU-resident scalar so
    # repeated calls with the same window reuse one tensor instead of re-uploading
    return torch.tensor(window_size // 128, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
def get_window_size_blocks(step: int):
    """Sliding-window size for this step as a GPU int32 tensor (in 128-token blocks).
    Grows linearly from 128 to 1792 tokens over the course of training."""
    progress = step / args.num_iterations
    assert 0 <= progress <= 1
    # increase by @fernbear.bsky.social; block-wise by @YouJiacheng
    return get_window_size_blocks_helper(next_multiple_of_n(1728 * progress, n=128))
model: nn.Module = torch.compile(model, mode="reduce-overhead", fullgraph=True, dynamic=False)
# Warmup the training kernels, then re-initialize the state so we aren't cheating
warmup_steps = 10
initial_state = dict(model=copy.deepcopy(model.state_dict()),
                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state
train_loader = distributed_data_generator(args.train_files, world_size * args.seq_len, rank, world_size)
# first warmup pass: trigger compilation / cudagraph capture with real steps
for _ in range(warmup_steps):
    inputs, targets = next(train_loader)
    torch.compiler.cudagraph_mark_step_begin()
    with autocast(device_type="cuda", dtype=torch.bfloat16):
        loss = model(inputs, targets, get_window_size_blocks(1))
    loss.backward()
    for opt in optimizers:
        opt.step()
    model.zero_grad(set_to_none=True)
torch.cuda.synchronize()
dist.barrier()
# second warmup pass under the profiler; each rank exports its own chrome trace
with torch.profiler.profile() as prof:
    for _ in range(warmup_steps):
        torch.compiler.cudagraph_mark_step_begin()
        inputs, targets = next(train_loader)
        with autocast(device_type="cuda", dtype=torch.bfloat16):
            loss = model(inputs, targets, get_window_size_blocks(1))
        loss.backward()
        for opt in optimizers:
            opt.step()
        model.zero_grad(set_to_none=True)
    torch.cuda.synchronize()
    dist.barrier()
os.makedirs("traces", exist_ok=True)
prof.export_chrome_trace(f"traces/trace_{rank}.json")
# restore the pristine model/optimizer state saved before warmup
model.load_state_dict(initial_state['model'])
for opt, opt_state in zip(optimizers, initial_state['optimizers']):
    opt.load_state_dict(opt_state)
del train_loader, initial_state
train_loader = distributed_data_generator(args.train_files, world_size * args.seq_len, rank, world_size)
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
train_steps = args.num_iterations
| for step in range(train_steps + 1): | |
| last_step = (step == train_steps) | |
| torch.compiler.cudagraph_mark_step_begin() | |
| # --------------- VALIDATION SECTION ----------------- | |
| if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): | |
| # stop the clock | |
| torch.cuda.synchronize() | |
| training_time_ms += 1000 * (time.perf_counter() - t0) | |
| model.eval() | |
| val_batch_size = world_size * args.val_seq_len | |
| assert args.val_tokens % val_batch_size == 0 | |
| val_steps = args.val_tokens // val_batch_size | |
| val_loader = distributed_data_generator(args.val_files, val_batch_size, rank, world_size) | |
| val_loss = 0 | |
| with torch.no_grad(): | |
| for _ in range(val_steps): | |
| inputs, targets = next(val_loader) | |
| with autocast(device_type="cuda", dtype=torch.bfloat16): | |
| val_loss += model(inputs, targets, get_window_size_blocks(step)) | |
| val_loss /= val_steps | |
| del val_loader | |
| dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) | |
| #if master_process: | |
| # wandb.log({"val/loss": val_loss}, step=step) | |
| print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) | |
| model.train() | |
| # start the clock again | |
| torch.cuda.synchronize() | |
| t0 = time.perf_counter() | |
| if last_step: | |
| if master_process and args.save_checkpoint: | |
| log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) | |
| os.makedirs(f"logs/{run_id}", exist_ok=True) | |
| torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") | |
| # the last step only has the validation loop, so break to avoid training | |
| break | |
| # --------------- TRAINING SECTION ----------------- | |
| inputs, targets = next(train_loader) | |
| with autocast(device_type="cuda", dtype=torch.bfloat16): | |
| loss = model(inputs, targets, get_window_size_blocks(step)) | |
| loss.backward() | |
| # set optimization hyperparameters | |
| for opt in optimizers: | |
| for group in opt.param_groups: | |
| group["lr"] = group["initial_lr"] * get_lr(step) | |
| frac = min(step / 300, 1) | |
| for group in optimizer2.param_groups: | |
| group["momentum"] = (1 - frac) * 0.85 + frac * 0.95 | |
| # step the optimizers and schedulers | |
| for opt in optimizers: | |
| opt.step() | |
| # null the gradients | |
| model.zero_grad(set_to_none=True) | |
| # logging | |
| approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) | |
| print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) | |
| print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " | |
| f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) | |
| dist.destroy_process_group() | |
| ==================================================================================================== | |
| Running Python 3.12.3 (main, Feb 4 2025, 14:48:35) [GCC 13.3.0] | |
| Running PyTorch 2.7.0a0+79aa17489c.nv25.04 compiled for CUDA 12.9 | |
| Fri May 30 12:25:55 2025 | |
| +-----------------------------------------------------------------------------------------+ | |
| | NVIDIA-SMI 550.127.05 Driver Version: 550.127.05 CUDA Version: 12.9 | | |
| |-----------------------------------------+------------------------+----------------------+ | |
| | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | | MIG M. | | |
| |=========================================+========================+======================| | |
| | 0 NVIDIA H100 80GB HBM3 On | 00000000:04:00.0 Off | 0 | | |
| | N/A 44C P0 129W / 700W | 5856MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 1 NVIDIA H100 80GB HBM3 On | 00000000:05:00.0 Off | 0 | | |
| | N/A 39C P0 126W / 700W | 1518MiB / 81559MiB | 1% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 2 NVIDIA H100 80GB HBM3 On | 00000000:0B:00.0 Off | 0 | | |
| | N/A 45C P0 132W / 700W | 1518MiB / 81559MiB | 1% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 3 NVIDIA H100 80GB HBM3 On | 00000000:0C:00.0 Off | 0 | | |
| | N/A 38C P0 124W / 700W | 1518MiB / 81559MiB | 1% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 4 NVIDIA H100 80GB HBM3 On | 00000000:84:00.0 Off | 0 | | |
| | N/A 44C P0 139W / 700W | 1518MiB / 81559MiB | 1% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 5 NVIDIA H100 80GB HBM3 On | 00000000:85:00.0 Off | 0 | | |
| | N/A 37C P0 117W / 700W | 1518MiB / 81559MiB | 1% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 6 NVIDIA H100 80GB HBM3 On | 00000000:8B:00.0 Off | 0 | | |
| | N/A 41C P0 119W / 700W | 1518MiB / 81559MiB | 1% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 7 NVIDIA H100 80GB HBM3 On | 00000000:8C:00.0 Off | 0 | | |
| | N/A 38C P0 117W / 700W | 1518MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| +-----------------------------------------------------------------------------------------+ | |
| | Processes: | | |
| | GPU GI CI PID Type Process name GPU Memory | | |
| | ID ID Usage | | |
| |=========================================================================================| | |
| +-----------------------------------------------------------------------------------------+ | |
| ==================================================================================================== | |
| scalars: torch.Size([64]) torch.float32 1.0 5.0 | |
| embed.weight: torch.Size([50304, 768]) torch.bfloat16 1.0 75.0 | |
| value_embeds.0.weight: torch.Size([50304, 768]) torch.bfloat16 1.0 75.0 | |
| value_embeds.1.weight: torch.Size([50304, 768]) torch.bfloat16 1.0 75.0 | |
| value_embeds.2.weight: torch.Size([50304, 768]) torch.bfloat16 1.0 75.0 | |
| blocks.0.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0 | |
| blocks.0.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0 | |
| blocks.0.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0 | |
| blocks.0.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0 | |
| blocks.1.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0 | |
| blocks.1.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0 | |
| blocks.1.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0 | |
| blocks.1.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0 | |
| blocks.2.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0 | |
| blocks.2.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0 | |
| blocks.2.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0 | |
| blocks.2.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0 | |
| blocks.3.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0 | |
| blocks.3.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0 | |
| blocks.3.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0 | |
| blocks.3.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0 | |
| blocks.4.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0 | |
| blocks.4.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0 | |
| blocks.4.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0 | |
| blocks.4.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0 | |
| blocks.5.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0 | |
| blocks.5.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0 | |
| blocks.5.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0 | |
| blocks.5.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0 | |
| blocks.6.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0 | |
| blocks.6.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0 | |
| blocks.6.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0 | |
| blocks.6.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0 | |
| blocks.7.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0 | |
| blocks.7.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0 | |
| blocks.8.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0 | |
| blocks.8.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0 | |
| blocks.8.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0 | |
| blocks.8.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0 | |
| blocks.9.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0 | |
| blocks.9.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0 | |
| blocks.9.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0 | |
| blocks.9.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0 | |
| blocks.10.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0 | |
| blocks.10.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0 | |
| blocks.10.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0 | |
| blocks.10.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0 | |
| blocks.11.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0 | |
| blocks.11.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0 | |
| blocks.11.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0 | |
| blocks.11.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0 | |
| lm_head.weight: torch.Size([50304, 768]) torch.float32 1.0 27.5 | |
| Model parameters: | |
| Total parameters: 275,742,784 | |
| Embedding parameters: 154,533,888 | |
| Non-embedding parameters: 121,208,896 | |
| step:0/1770 val_loss:10.8258 train_time:0ms step_avg:0.03ms | |
| step:1/1770 train_time:157ms step_avg:157.17ms | |
| step:2/1770 train_time:170ms step_avg:84.81ms | |
| step:3/1770 train_time:178ms step_avg:59.47ms | |
| step:4/1770 train_time:187ms step_avg:46.80ms | |
| step:5/1770 train_time:260ms step_avg:52.10ms | |
| step:6/1770 train_time:353ms step_avg:58.84ms | |
| step:7/1770 train_time:446ms step_avg:63.76ms | |
| step:8/1770 train_time:539ms step_avg:67.37ms | |
| step:9/1770 train_time:633ms step_avg:70.32ms | |
| step:10/1770 train_time:726ms step_avg:72.56ms | |
| step:11/1770 train_time:818ms step_avg:74.41ms | |
| step:12/1770 train_time:912ms step_avg:75.96ms | |
| step:13/1770 train_time:1005ms step_avg:77.32ms | |
| step:14/1770 train_time:1102ms step_avg:78.72ms | |
| step:15/1770 train_time:1199ms step_avg:79.96ms | |
| step:16/1770 train_time:1295ms step_avg:80.92ms | |
| step:17/1770 train_time:1389ms step_avg:81.69ms | |
| step:18/1770 train_time:1483ms step_avg:82.41ms | |
| step:19/1770 train_time:1577ms step_avg:82.99ms | |
| step:20/1770 train_time:1670ms step_avg:83.52ms | |
| step:21/1770 train_time:1764ms step_avg:84.00ms | |
| step:22/1770 train_time:1857ms step_avg:84.41ms | |
| step:23/1770 train_time:1951ms step_avg:84.81ms | |
| step:24/1770 train_time:2046ms step_avg:85.25ms | |
| step:25/1770 train_time:2142ms step_avg:85.67ms | |
| step:26/1770 train_time:2238ms step_avg:86.06ms | |
| step:27/1770 train_time:2333ms step_avg:86.42ms | |
| step:28/1770 train_time:2428ms step_avg:86.71ms | |
| step:29/1770 train_time:2522ms step_avg:86.97ms | |
| step:30/1770 train_time:2616ms step_avg:87.20ms | |
| step:31/1770 train_time:2709ms step_avg:87.40ms | |
| step:32/1770 train_time:2803ms step_avg:87.60ms | |
| step:33/1770 train_time:2897ms step_avg:87.78ms | |
| step:34/1770 train_time:2992ms step_avg:87.99ms | |
| step:35/1770 train_time:3086ms step_avg:88.18ms | |
| step:36/1770 train_time:3181ms step_avg:88.35ms | |
| step:37/1770 train_time:3276ms step_avg:88.55ms | |
| step:38/1770 train_time:3372ms step_avg:88.73ms | |
| step:39/1770 train_time:3466ms step_avg:88.88ms | |
| step:40/1770 train_time:3560ms step_avg:89.01ms | |
| step:41/1770 train_time:3654ms step_avg:89.13ms | |
| step:42/1770 train_time:3748ms step_avg:89.25ms | |
| step:43/1770 train_time:3841ms step_avg:89.33ms | |
| step:44/1770 train_time:3936ms step_avg:89.46ms | |
| step:45/1770 train_time:4030ms step_avg:89.57ms | |
| step:46/1770 train_time:4125ms step_avg:89.68ms | |
| step:47/1770 train_time:4219ms step_avg:89.77ms | |
| step:48/1770 train_time:4315ms step_avg:89.89ms | |
| step:49/1770 train_time:4411ms step_avg:90.01ms | |
| step:50/1770 train_time:4506ms step_avg:90.12ms | |
| step:51/1770 train_time:4599ms step_avg:90.17ms | |
| step:52/1770 train_time:4693ms step_avg:90.26ms | |
| step:53/1770 train_time:4788ms step_avg:90.33ms | |
| step:54/1770 train_time:4882ms step_avg:90.41ms | |
| step:55/1770 train_time:4976ms step_avg:90.48ms | |
| step:56/1770 train_time:5072ms step_avg:90.57ms | |
| step:57/1770 train_time:5166ms step_avg:90.63ms | |
| step:58/1770 train_time:5260ms step_avg:90.69ms | |
| step:59/1770 train_time:5355ms step_avg:90.77ms | |
| step:60/1770 train_time:5451ms step_avg:90.85ms | |
| step:61/1770 train_time:5545ms step_avg:90.91ms | |
| step:62/1770 train_time:5639ms step_avg:90.95ms | |
| step:63/1770 train_time:5733ms step_avg:91.01ms | |
| step:64/1770 train_time:5827ms step_avg:91.05ms | |
| step:65/1770 train_time:5921ms step_avg:91.09ms | |
| step:66/1770 train_time:6016ms step_avg:91.15ms | |
| step:67/1770 train_time:6110ms step_avg:91.20ms | |
| step:68/1770 train_time:6204ms step_avg:91.24ms | |
| step:69/1770 train_time:6299ms step_avg:91.28ms | |
| step:70/1770 train_time:6394ms step_avg:91.34ms | |
| step:71/1770 train_time:6489ms step_avg:91.40ms | |
| step:72/1770 train_time:6583ms step_avg:91.43ms | |
| step:73/1770 train_time:6677ms step_avg:91.47ms | |
| step:74/1770 train_time:6772ms step_avg:91.51ms | |
| step:75/1770 train_time:6866ms step_avg:91.55ms | |
| step:76/1770 train_time:6960ms step_avg:91.58ms | |
| step:77/1770 train_time:7056ms step_avg:91.63ms | |
| step:78/1770 train_time:7151ms step_avg:91.67ms | |
| step:79/1770 train_time:7244ms step_avg:91.70ms | |
| step:80/1770 train_time:7338ms step_avg:91.72ms | |
| step:81/1770 train_time:7434ms step_avg:91.77ms | |
| step:82/1770 train_time:7528ms step_avg:91.80ms | |
| step:83/1770 train_time:7621ms step_avg:91.82ms | |
| step:84/1770 train_time:7717ms step_avg:91.87ms | |
| step:85/1770 train_time:7812ms step_avg:91.91ms | |
| step:86/1770 train_time:7907ms step_avg:91.94ms | |
| step:87/1770 train_time:8001ms step_avg:91.96ms | |
| step:88/1770 train_time:8095ms step_avg:91.99ms | |
| step:89/1770 train_time:8191ms step_avg:92.03ms | |
| step:90/1770 train_time:8284ms step_avg:92.05ms | |
| step:91/1770 train_time:8379ms step_avg:92.08ms | |
| step:92/1770 train_time:8474ms step_avg:92.11ms | |
| step:93/1770 train_time:8568ms step_avg:92.13ms | |
| step:94/1770 train_time:8663ms step_avg:92.16ms | |
| step:95/1770 train_time:8757ms step_avg:92.18ms | |
| step:96/1770 train_time:8851ms step_avg:92.20ms | |
| step:97/1770 train_time:8944ms step_avg:92.21ms | |
| step:98/1770 train_time:9038ms step_avg:92.23ms | |
| step:99/1770 train_time:9134ms step_avg:92.26ms | |
| step:100/1770 train_time:9228ms step_avg:92.28ms | |
| step:101/1770 train_time:9322ms step_avg:92.30ms | |
| step:102/1770 train_time:9416ms step_avg:92.32ms | |
| step:103/1770 train_time:9511ms step_avg:92.34ms | |
| step:104/1770 train_time:9605ms step_avg:92.36ms | |
| step:105/1770 train_time:9699ms step_avg:92.37ms | |
| step:106/1770 train_time:9794ms step_avg:92.39ms | |
| step:107/1770 train_time:9887ms step_avg:92.41ms | |
| step:108/1770 train_time:9982ms step_avg:92.42ms | |
| step:109/1770 train_time:10076ms step_avg:92.44ms | |
| step:110/1770 train_time:10171ms step_avg:92.46ms | |
| step:111/1770 train_time:10266ms step_avg:92.48ms | |
| step:112/1770 train_time:10360ms step_avg:92.50ms | |
| step:113/1770 train_time:10455ms step_avg:92.52ms | |
| step:114/1770 train_time:10549ms step_avg:92.54ms | |
| step:115/1770 train_time:10643ms step_avg:92.55ms | |
| step:116/1770 train_time:10737ms step_avg:92.56ms | |
| step:117/1770 train_time:10832ms step_avg:92.58ms | |
| step:118/1770 train_time:10926ms step_avg:92.60ms | |
| step:119/1770 train_time:11020ms step_avg:92.61ms | |
| step:120/1770 train_time:11115ms step_avg:92.63ms | |
| step:121/1770 train_time:11210ms step_avg:92.64ms | |
| step:122/1770 train_time:11305ms step_avg:92.66ms | |
| step:123/1770 train_time:11399ms step_avg:92.68ms | |
| step:124/1770 train_time:11494ms step_avg:92.69ms | |
| step:125/1770 train_time:11589ms step_avg:92.71ms | |
| step:125/1770 val_loss:4.6449 train_time:11860ms step_avg:94.88ms | |
| step:126/1770 train_time:11962ms step_avg:94.94ms | |
| step:127/1770 train_time:11986ms step_avg:94.38ms | |
| step:128/1770 train_time:12059ms step_avg:94.21ms | |
| step:129/1770 train_time:12100ms step_avg:93.80ms | |
| step:130/1770 train_time:12157ms step_avg:93.52ms | |
| step:131/1770 train_time:12217ms step_avg:93.26ms | |
| step:132/1770 train_time:12258ms step_avg:92.86ms | |
| step:133/1770 train_time:12351ms step_avg:92.86ms | |
| step:134/1770 train_time:12444ms step_avg:92.87ms | |
| step:135/1770 train_time:12537ms step_avg:92.87ms | |
| step:136/1770 train_time:12631ms step_avg:92.88ms | |
| step:137/1770 train_time:12727ms step_avg:92.90ms | |
| step:138/1770 train_time:12826ms step_avg:92.94ms | |
| step:139/1770 train_time:12923ms step_avg:92.97ms | |
| step:140/1770 train_time:13018ms step_avg:92.99ms | |
| step:141/1770 train_time:13114ms step_avg:93.00ms | |
| step:142/1770 train_time:13208ms step_avg:93.02ms | |
| step:143/1770 train_time:13303ms step_avg:93.03ms | |
| step:144/1770 train_time:13397ms step_avg:93.04ms | |
| step:145/1770 train_time:13491ms step_avg:93.04ms | |
| step:146/1770 train_time:13585ms step_avg:93.05ms | |
| step:147/1770 train_time:13680ms step_avg:93.06ms | |
| step:148/1770 train_time:13776ms step_avg:93.08ms | |
| step:149/1770 train_time:13873ms step_avg:93.11ms | |
| step:150/1770 train_time:13971ms step_avg:93.14ms | |
| step:151/1770 train_time:14068ms step_avg:93.17ms | |
| step:152/1770 train_time:14163ms step_avg:93.18ms | |
| step:153/1770 train_time:14257ms step_avg:93.18ms | |
| step:154/1770 train_time:14352ms step_avg:93.19ms | |
| step:155/1770 train_time:14447ms step_avg:93.21ms | |
| step:156/1770 train_time:14541ms step_avg:93.21ms | |
| step:157/1770 train_time:14635ms step_avg:93.22ms | |
| step:158/1770 train_time:14731ms step_avg:93.23ms | |
| step:159/1770 train_time:14828ms step_avg:93.26ms | |
| step:160/1770 train_time:14924ms step_avg:93.27ms | |
| step:161/1770 train_time:15018ms step_avg:93.28ms | |
| step:162/1770 train_time:15113ms step_avg:93.29ms | |
| step:163/1770 train_time:15209ms step_avg:93.31ms | |
| step:164/1770 train_time:15304ms step_avg:93.32ms | |
| step:165/1770 train_time:15399ms step_avg:93.33ms | |
| step:166/1770 train_time:15493ms step_avg:93.33ms | |
| step:167/1770 train_time:15588ms step_avg:93.34ms | |
| step:168/1770 train_time:15683ms step_avg:93.35ms | |
| step:169/1770 train_time:15778ms step_avg:93.36ms | |
| step:170/1770 train_time:15873ms step_avg:93.37ms | |
| step:171/1770 train_time:15971ms step_avg:93.39ms | |
| step:172/1770 train_time:16066ms step_avg:93.41ms | |
| step:173/1770 train_time:16161ms step_avg:93.42ms | |
| step:174/1770 train_time:16256ms step_avg:93.42ms | |
| step:175/1770 train_time:16351ms step_avg:93.43ms | |
| step:176/1770 train_time:16447ms step_avg:93.45ms | |
| step:177/1770 train_time:16541ms step_avg:93.45ms | |
| step:178/1770 train_time:16635ms step_avg:93.46ms | |
| step:179/1770 train_time:16731ms step_avg:93.47ms | |
| step:180/1770 train_time:16828ms step_avg:93.49ms | |
| step:181/1770 train_time:16924ms step_avg:93.50ms | |
| step:182/1770 train_time:17019ms step_avg:93.51ms | |
| step:183/1770 train_time:17114ms step_avg:93.52ms | |
| step:184/1770 train_time:17210ms step_avg:93.53ms | |
| step:185/1770 train_time:17305ms step_avg:93.54ms | |
| step:186/1770 train_time:17399ms step_avg:93.54ms | |
| step:187/1770 train_time:17494ms step_avg:93.55ms | |
| step:188/1770 train_time:17589ms step_avg:93.56ms | |
| step:189/1770 train_time:17684ms step_avg:93.57ms | |
| step:190/1770 train_time:17779ms step_avg:93.57ms | |
| step:191/1770 train_time:17874ms step_avg:93.58ms | |
| step:192/1770 train_time:17970ms step_avg:93.59ms | |
| step:193/1770 train_time:18066ms step_avg:93.61ms | |
| step:194/1770 train_time:18162ms step_avg:93.62ms | |
| step:195/1770 train_time:18257ms step_avg:93.62ms | |
| step:196/1770 train_time:18351ms step_avg:93.63ms | |
| step:197/1770 train_time:18447ms step_avg:93.64ms | |
| step:198/1770 train_time:18542ms step_avg:93.65ms | |
| step:199/1770 train_time:18637ms step_avg:93.65ms | |
| step:200/1770 train_time:18731ms step_avg:93.65ms | |
| step:201/1770 train_time:18827ms step_avg:93.67ms | |
| step:202/1770 train_time:18923ms step_avg:93.68ms | |
| step:203/1770 train_time:19018ms step_avg:93.68ms | |
| step:204/1770 train_time:19113ms step_avg:93.69ms | |
| step:205/1770 train_time:19210ms step_avg:93.71ms | |
| step:206/1770 train_time:19305ms step_avg:93.71ms | |
| step:207/1770 train_time:19400ms step_avg:93.72ms | |
| step:208/1770 train_time:19495ms step_avg:93.72ms | |
| step:209/1770 train_time:19590ms step_avg:93.73ms | |
| step:210/1770 train_time:19685ms step_avg:93.74ms | |
| step:211/1770 train_time:19779ms step_avg:93.74ms | |
| step:212/1770 train_time:19874ms step_avg:93.75ms | |
| step:213/1770 train_time:19970ms step_avg:93.76ms | |
| step:214/1770 train_time:20067ms step_avg:93.77ms | |
| step:215/1770 train_time:20163ms step_avg:93.78ms | |
| step:216/1770 train_time:20258ms step_avg:93.79ms | |
| step:217/1770 train_time:20352ms step_avg:93.79ms | |
| step:218/1770 train_time:20449ms step_avg:93.80ms | |
| step:219/1770 train_time:20544ms step_avg:93.81ms | |
| step:220/1770 train_time:20640ms step_avg:93.82ms | |
| step:221/1770 train_time:20735ms step_avg:93.82ms | |
| step:222/1770 train_time:20831ms step_avg:93.83ms | |
| step:223/1770 train_time:20926ms step_avg:93.84ms | |
| step:224/1770 train_time:21022ms step_avg:93.85ms | |
| step:225/1770 train_time:21117ms step_avg:93.85ms | |
| step:226/1770 train_time:21212ms step_avg:93.86ms | |
| step:227/1770 train_time:21309ms step_avg:93.87ms | |
| step:228/1770 train_time:21404ms step_avg:93.88ms | |
| step:229/1770 train_time:21498ms step_avg:93.88ms | |
| step:230/1770 train_time:21593ms step_avg:93.88ms | |
| step:231/1770 train_time:21688ms step_avg:93.89ms | |
| step:232/1770 train_time:21784ms step_avg:93.89ms | |
| step:233/1770 train_time:21878ms step_avg:93.90ms | |
| step:234/1770 train_time:21974ms step_avg:93.91ms | |
| step:235/1770 train_time:22070ms step_avg:93.92ms | |
| step:236/1770 train_time:22166ms step_avg:93.92ms | |
| step:237/1770 train_time:22261ms step_avg:93.93ms | |
| step:238/1770 train_time:22356ms step_avg:93.93ms | |
| step:239/1770 train_time:22450ms step_avg:93.93ms | |
| step:240/1770 train_time:22546ms step_avg:93.94ms | |
| step:241/1770 train_time:22640ms step_avg:93.94ms | |
| step:242/1770 train_time:22735ms step_avg:93.95ms | |
| step:243/1770 train_time:22831ms step_avg:93.96ms | |
| step:244/1770 train_time:22928ms step_avg:93.97ms | |
| step:245/1770 train_time:23024ms step_avg:93.98ms | |
| step:246/1770 train_time:23119ms step_avg:93.98ms | |
| step:247/1770 train_time:23214ms step_avg:93.98ms | |
| step:248/1770 train_time:23311ms step_avg:93.99ms | |
| step:249/1770 train_time:23407ms step_avg:94.00ms | |
| step:250/1770 train_time:23502ms step_avg:94.01ms | |
| step:250/1770 val_loss:4.1038 train_time:23775ms step_avg:95.10ms | |
| step:251/1770 train_time:23786ms step_avg:94.76ms | |
| step:252/1770 train_time:23795ms step_avg:94.42ms | |
| step:253/1770 train_time:23804ms step_avg:94.09ms | |
| step:254/1770 train_time:23891ms step_avg:94.06ms | |
| step:255/1770 train_time:23988ms step_avg:94.07ms | |
| step:256/1770 train_time:24085ms step_avg:94.08ms | |
| step:257/1770 train_time:24182ms step_avg:94.09ms | |
| step:258/1770 train_time:24277ms step_avg:94.10ms | |
| step:259/1770 train_time:24370ms step_avg:94.09ms | |
| step:260/1770 train_time:24465ms step_avg:94.09ms | |
| step:261/1770 train_time:24560ms step_avg:94.10ms | |
| step:262/1770 train_time:24653ms step_avg:94.10ms | |
| step:263/1770 train_time:24749ms step_avg:94.10ms | |
| step:264/1770 train_time:24847ms step_avg:94.12ms | |
| step:265/1770 train_time:24945ms step_avg:94.13ms | |
| step:266/1770 train_time:25042ms step_avg:94.14ms | |
| step:267/1770 train_time:25139ms step_avg:94.15ms | |
| step:268/1770 train_time:25235ms step_avg:94.16ms | |
| step:269/1770 train_time:25331ms step_avg:94.17ms | |
| step:270/1770 train_time:25425ms step_avg:94.17ms | |
| step:271/1770 train_time:25520ms step_avg:94.17ms | |
| step:272/1770 train_time:25616ms step_avg:94.18ms | |
| step:273/1770 train_time:25711ms step_avg:94.18ms | |
| step:274/1770 train_time:25807ms step_avg:94.19ms | |
| step:275/1770 train_time:25906ms step_avg:94.20ms | |
| step:276/1770 train_time:26004ms step_avg:94.22ms | |
| step:277/1770 train_time:26100ms step_avg:94.23ms | |
| step:278/1770 train_time:26197ms step_avg:94.23ms | |
| step:279/1770 train_time:26292ms step_avg:94.24ms | |
| step:280/1770 train_time:26388ms step_avg:94.24ms | |
| step:281/1770 train_time:26484ms step_avg:94.25ms | |
| step:282/1770 train_time:26581ms step_avg:94.26ms | |
| step:283/1770 train_time:26677ms step_avg:94.26ms | |
| step:284/1770 train_time:26773ms step_avg:94.27ms | |
| step:285/1770 train_time:26870ms step_avg:94.28ms | |
| step:286/1770 train_time:26967ms step_avg:94.29ms | |
| step:287/1770 train_time:27064ms step_avg:94.30ms | |
| step:288/1770 train_time:27160ms step_avg:94.31ms | |
| step:289/1770 train_time:27257ms step_avg:94.32ms | |
| step:290/1770 train_time:27352ms step_avg:94.32ms | |
| step:291/1770 train_time:27448ms step_avg:94.32ms | |
| step:292/1770 train_time:27544ms step_avg:94.33ms | |
| step:293/1770 train_time:27640ms step_avg:94.33ms | |
| step:294/1770 train_time:27736ms step_avg:94.34ms | |
| step:295/1770 train_time:27832ms step_avg:94.35ms | |
| step:296/1770 train_time:27928ms step_avg:94.35ms | |
| step:297/1770 train_time:28025ms step_avg:94.36ms | |
| step:298/1770 train_time:28122ms step_avg:94.37ms | |
| step:299/1770 train_time:28219ms step_avg:94.38ms | |
| step:300/1770 train_time:28314ms step_avg:94.38ms | |
| step:301/1770 train_time:28409ms step_avg:94.38ms | |
| step:302/1770 train_time:28505ms step_avg:94.39ms | |
| step:303/1770 train_time:28602ms step_avg:94.40ms | |
| step:304/1770 train_time:28697ms step_avg:94.40ms | |
| step:305/1770 train_time:28793ms step_avg:94.40ms | |
| step:306/1770 train_time:28888ms step_avg:94.41ms | |
| step:307/1770 train_time:28985ms step_avg:94.41ms | |
| step:308/1770 train_time:29082ms step_avg:94.42ms | |
| step:309/1770 train_time:29180ms step_avg:94.43ms | |
| step:310/1770 train_time:29277ms step_avg:94.44ms | |
| step:311/1770 train_time:29372ms step_avg:94.44ms | |
| step:312/1770 train_time:29468ms step_avg:94.45ms | |
| step:313/1770 train_time:29564ms step_avg:94.45ms | |
| step:314/1770 train_time:29661ms step_avg:94.46ms | |
| step:315/1770 train_time:29757ms step_avg:94.47ms | |
| step:316/1770 train_time:29853ms step_avg:94.47ms | |
| step:317/1770 train_time:29949ms step_avg:94.48ms | |
| step:318/1770 train_time:30045ms step_avg:94.48ms | |
| step:319/1770 train_time:30142ms step_avg:94.49ms | |
| step:320/1770 train_time:30238ms step_avg:94.49ms | |
| step:321/1770 train_time:30333ms step_avg:94.50ms | |
| step:322/1770 train_time:30429ms step_avg:94.50ms | |
| step:323/1770 train_time:30525ms step_avg:94.51ms | |
| step:324/1770 train_time:30622ms step_avg:94.51ms | |
| step:325/1770 train_time:30718ms step_avg:94.52ms | |
| step:326/1770 train_time:30813ms step_avg:94.52ms | |
| step:327/1770 train_time:30909ms step_avg:94.52ms | |
| step:328/1770 train_time:31006ms step_avg:94.53ms | |
| step:329/1770 train_time:31103ms step_avg:94.54ms | |
| step:330/1770 train_time:31200ms step_avg:94.54ms | |
| step:331/1770 train_time:31296ms step_avg:94.55ms | |
| step:332/1770 train_time:31391ms step_avg:94.55ms | |
| step:333/1770 train_time:31487ms step_avg:94.55ms | |
| step:334/1770 train_time:31583ms step_avg:94.56ms | |
| step:335/1770 train_time:31679ms step_avg:94.56ms | |
| step:336/1770 train_time:31774ms step_avg:94.57ms | |
| step:337/1770 train_time:31870ms step_avg:94.57ms | |
| step:338/1770 train_time:31966ms step_avg:94.57ms | |
| step:339/1770 train_time:32063ms step_avg:94.58ms | |
| step:340/1770 train_time:32159ms step_avg:94.59ms | |
| step:341/1770 train_time:32255ms step_avg:94.59ms | |
| step:342/1770 train_time:32351ms step_avg:94.59ms | |
| step:343/1770 train_time:32447ms step_avg:94.60ms | |
| step:344/1770 train_time:32544ms step_avg:94.61ms | |
| step:345/1770 train_time:32641ms step_avg:94.61ms | |
| step:346/1770 train_time:32738ms step_avg:94.62ms | |
| step:347/1770 train_time:32833ms step_avg:94.62ms | |
| step:348/1770 train_time:32930ms step_avg:94.63ms | |
| step:349/1770 train_time:33026ms step_avg:94.63ms | |
| step:350/1770 train_time:33124ms step_avg:94.64ms | |
| step:351/1770 train_time:33221ms step_avg:94.65ms | |
| step:352/1770 train_time:33318ms step_avg:94.65ms | |
| step:353/1770 train_time:33414ms step_avg:94.66ms | |
| step:354/1770 train_time:33509ms step_avg:94.66ms | |
| step:355/1770 train_time:33606ms step_avg:94.66ms | |
| step:356/1770 train_time:33703ms step_avg:94.67ms | |
| step:357/1770 train_time:33799ms step_avg:94.68ms | |
| step:358/1770 train_time:33895ms step_avg:94.68ms | |
| step:359/1770 train_time:33990ms step_avg:94.68ms | |
| step:360/1770 train_time:34086ms step_avg:94.68ms | |
| step:361/1770 train_time:34183ms step_avg:94.69ms | |
| step:362/1770 train_time:34279ms step_avg:94.69ms | |
| step:363/1770 train_time:34376ms step_avg:94.70ms | |
| step:364/1770 train_time:34471ms step_avg:94.70ms | |
| step:365/1770 train_time:34567ms step_avg:94.70ms | |
| step:366/1770 train_time:34664ms step_avg:94.71ms | |
| step:367/1770 train_time:34761ms step_avg:94.72ms | |
| step:368/1770 train_time:34857ms step_avg:94.72ms | |
| step:369/1770 train_time:34953ms step_avg:94.72ms | |
| step:370/1770 train_time:35048ms step_avg:94.73ms | |
| step:371/1770 train_time:35145ms step_avg:94.73ms | |
| step:372/1770 train_time:35242ms step_avg:94.74ms | |
| step:373/1770 train_time:35338ms step_avg:94.74ms | |
| step:374/1770 train_time:35435ms step_avg:94.74ms | |
| step:375/1770 train_time:35529ms step_avg:94.75ms | |
| step:375/1770 val_loss:3.8967 train_time:35806ms step_avg:95.48ms | |
| step:376/1770 train_time:35817ms step_avg:95.26ms | |
| step:377/1770 train_time:35826ms step_avg:95.03ms | |
| step:378/1770 train_time:35835ms step_avg:94.80ms | |
| step:379/1770 train_time:35919ms step_avg:94.77ms | |
| step:380/1770 train_time:36018ms step_avg:94.78ms | |
| step:381/1770 train_time:36113ms step_avg:94.78ms | |
| step:382/1770 train_time:36208ms step_avg:94.79ms | |
| step:383/1770 train_time:36304ms step_avg:94.79ms | |
| step:384/1770 train_time:36399ms step_avg:94.79ms | |
| step:385/1770 train_time:36494ms step_avg:94.79ms | |
| step:386/1770 train_time:36589ms step_avg:94.79ms | |
| step:387/1770 train_time:36685ms step_avg:94.79ms | |
| step:388/1770 train_time:36784ms step_avg:94.80ms | |
| step:389/1770 train_time:36883ms step_avg:94.81ms | |
| step:390/1770 train_time:36981ms step_avg:94.82ms | |
| step:391/1770 train_time:37078ms step_avg:94.83ms | |
| step:392/1770 train_time:37173ms step_avg:94.83ms | |
| step:393/1770 train_time:37269ms step_avg:94.83ms | |
| step:394/1770 train_time:37365ms step_avg:94.84ms | |
| step:395/1770 train_time:37461ms step_avg:94.84ms | |
| step:396/1770 train_time:37556ms step_avg:94.84ms | |
| step:397/1770 train_time:37652ms step_avg:94.84ms | |
| step:398/1770 train_time:37750ms step_avg:94.85ms | |
| step:399/1770 train_time:37850ms step_avg:94.86ms | |
| step:400/1770 train_time:37949ms step_avg:94.87ms | |
| step:401/1770 train_time:38048ms step_avg:94.88ms | |
| step:402/1770 train_time:38147ms step_avg:94.89ms | |
| step:403/1770 train_time:38247ms step_avg:94.91ms | |
| step:404/1770 train_time:38345ms step_avg:94.91ms | |
| step:405/1770 train_time:38444ms step_avg:94.92ms | |
| step:406/1770 train_time:38542ms step_avg:94.93ms | |
| step:407/1770 train_time:38641ms step_avg:94.94ms | |
| step:408/1770 train_time:38739ms step_avg:94.95ms | |
| step:409/1770 train_time:38838ms step_avg:94.96ms | |
| step:410/1770 train_time:38936ms step_avg:94.96ms | |
| step:411/1770 train_time:39034ms step_avg:94.97ms | |
| step:412/1770 train_time:39132ms step_avg:94.98ms | |
| step:413/1770 train_time:39229ms step_avg:94.99ms | |
| step:414/1770 train_time:39326ms step_avg:94.99ms | |
| step:415/1770 train_time:39425ms step_avg:95.00ms | |
| step:416/1770 train_time:39523ms step_avg:95.01ms | |
| step:417/1770 train_time:39621ms step_avg:95.01ms | |
| step:418/1770 train_time:39719ms step_avg:95.02ms | |
| step:419/1770 train_time:39818ms step_avg:95.03ms | |
| step:420/1770 train_time:39916ms step_avg:95.04ms | |
| step:421/1770 train_time:40015ms step_avg:95.05ms | |
| step:422/1770 train_time:40114ms step_avg:95.06ms | |
| step:423/1770 train_time:40212ms step_avg:95.06ms | |
| step:424/1770 train_time:40310ms step_avg:95.07ms | |
| step:425/1770 train_time:40408ms step_avg:95.08ms | |
| step:426/1770 train_time:40507ms step_avg:95.09ms | |
| step:427/1770 train_time:40605ms step_avg:95.09ms | |
| step:428/1770 train_time:40704ms step_avg:95.10ms | |
| step:429/1770 train_time:40803ms step_avg:95.11ms | |
| step:430/1770 train_time:40901ms step_avg:95.12ms | |
| step:431/1770 train_time:41000ms step_avg:95.13ms | |
| step:432/1770 train_time:41099ms step_avg:95.14ms | |
| step:433/1770 train_time:41196ms step_avg:95.14ms | |
| step:434/1770 train_time:41294ms step_avg:95.15ms | |
| step:435/1770 train_time:41391ms step_avg:95.15ms | |
| step:436/1770 train_time:41489ms step_avg:95.16ms | |
| step:437/1770 train_time:41588ms step_avg:95.17ms | |
| step:438/1770 train_time:41687ms step_avg:95.18ms | |
| step:439/1770 train_time:41786ms step_avg:95.18ms | |
| step:440/1770 train_time:41885ms step_avg:95.19ms | |
| step:441/1770 train_time:41985ms step_avg:95.20ms | |
| step:442/1770 train_time:42084ms step_avg:95.21ms | |
| step:443/1770 train_time:42183ms step_avg:95.22ms | |
| step:444/1770 train_time:42283ms step_avg:95.23ms | |
| step:445/1770 train_time:42383ms step_avg:95.24ms | |
| step:446/1770 train_time:42482ms step_avg:95.25ms | |
| step:447/1770 train_time:42579ms step_avg:95.26ms | |
| step:448/1770 train_time:42677ms step_avg:95.26ms | |
| step:449/1770 train_time:42774ms step_avg:95.27ms | |
| step:450/1770 train_time:42872ms step_avg:95.27ms | |
| step:451/1770 train_time:42970ms step_avg:95.28ms | |
| step:452/1770 train_time:43069ms step_avg:95.28ms | |
| step:453/1770 train_time:43167ms step_avg:95.29ms | |
| step:454/1770 train_time:43267ms step_avg:95.30ms | |
| step:455/1770 train_time:43367ms step_avg:95.31ms | |
| step:456/1770 train_time:43466ms step_avg:95.32ms | |
| step:457/1770 train_time:43565ms step_avg:95.33ms | |
| step:458/1770 train_time:43663ms step_avg:95.33ms | |
| step:459/1770 train_time:43762ms step_avg:95.34ms | |
| step:460/1770 train_time:43861ms step_avg:95.35ms | |
| step:461/1770 train_time:43959ms step_avg:95.36ms | |
| step:462/1770 train_time:44058ms step_avg:95.36ms | |
| step:463/1770 train_time:44157ms step_avg:95.37ms | |
| step:464/1770 train_time:44255ms step_avg:95.38ms | |
| step:465/1770 train_time:44353ms step_avg:95.38ms | |
| step:466/1770 train_time:44451ms step_avg:95.39ms | |
| step:467/1770 train_time:44549ms step_avg:95.39ms | |
| step:468/1770 train_time:44648ms step_avg:95.40ms | |
| step:469/1770 train_time:44745ms step_avg:95.41ms | |
| step:470/1770 train_time:44844ms step_avg:95.41ms | |
| step:471/1770 train_time:44943ms step_avg:95.42ms | |
| step:472/1770 train_time:45042ms step_avg:95.43ms | |
| step:473/1770 train_time:45140ms step_avg:95.43ms | |
| step:474/1770 train_time:45238ms step_avg:95.44ms | |
| step:475/1770 train_time:45338ms step_avg:95.45ms | |
| step:476/1770 train_time:45437ms step_avg:95.46ms | |
| step:477/1770 train_time:45535ms step_avg:95.46ms | |
| step:478/1770 train_time:45632ms step_avg:95.46ms | |
| step:479/1770 train_time:45729ms step_avg:95.47ms | |
| step:480/1770 train_time:45829ms step_avg:95.48ms | |
| step:481/1770 train_time:45928ms step_avg:95.48ms | |
| step:482/1770 train_time:46027ms step_avg:95.49ms | |
| step:483/1770 train_time:46127ms step_avg:95.50ms | |
| step:484/1770 train_time:46226ms step_avg:95.51ms | |
| step:485/1770 train_time:46326ms step_avg:95.52ms | |
| step:486/1770 train_time:46424ms step_avg:95.52ms | |
| step:487/1770 train_time:46523ms step_avg:95.53ms | |
| step:488/1770 train_time:46621ms step_avg:95.54ms | |
| step:489/1770 train_time:46719ms step_avg:95.54ms | |
| step:490/1770 train_time:46817ms step_avg:95.55ms | |
| step:491/1770 train_time:46915ms step_avg:95.55ms | |
| step:492/1770 train_time:47013ms step_avg:95.55ms | |
| step:493/1770 train_time:47111ms step_avg:95.56ms | |
| step:494/1770 train_time:47210ms step_avg:95.57ms | |
| step:495/1770 train_time:47308ms step_avg:95.57ms | |
| step:496/1770 train_time:47407ms step_avg:95.58ms | |
| step:497/1770 train_time:47505ms step_avg:95.58ms | |
| step:498/1770 train_time:47604ms step_avg:95.59ms | |
| step:499/1770 train_time:47703ms step_avg:95.60ms | |
| step:500/1770 train_time:47801ms step_avg:95.60ms | |
| step:500/1770 val_loss:3.7501 train_time:48083ms step_avg:96.17ms | |
| step:501/1770 train_time:48093ms step_avg:95.99ms | |
| step:502/1770 train_time:48102ms step_avg:95.82ms | |
| step:503/1770 train_time:48111ms step_avg:95.65ms | |
| step:504/1770 train_time:48199ms step_avg:95.63ms | |
| step:505/1770 train_time:48296ms step_avg:95.64ms | |
| step:506/1770 train_time:48393ms step_avg:95.64ms | |
| step:507/1770 train_time:48492ms step_avg:95.64ms | |
| step:508/1770 train_time:48591ms step_avg:95.65ms | |
| step:509/1770 train_time:48689ms step_avg:95.66ms | |
| step:510/1770 train_time:48786ms step_avg:95.66ms | |
| step:511/1770 train_time:48884ms step_avg:95.66ms | |
| step:512/1770 train_time:48982ms step_avg:95.67ms | |
| step:513/1770 train_time:49083ms step_avg:95.68ms | |
| step:514/1770 train_time:49183ms step_avg:95.69ms | |
| step:515/1770 train_time:49281ms step_avg:95.69ms | |
| step:516/1770 train_time:49379ms step_avg:95.70ms | |
| step:517/1770 train_time:49476ms step_avg:95.70ms | |
| step:518/1770 train_time:49574ms step_avg:95.70ms | |
| step:519/1770 train_time:49672ms step_avg:95.71ms | |
| step:520/1770 train_time:49769ms step_avg:95.71ms | |
| step:521/1770 train_time:49867ms step_avg:95.71ms | |
| step:522/1770 train_time:49965ms step_avg:95.72ms | |
| step:523/1770 train_time:50064ms step_avg:95.72ms | |
| step:524/1770 train_time:50162ms step_avg:95.73ms | |
| step:525/1770 train_time:50261ms step_avg:95.73ms | |
| step:526/1770 train_time:50359ms step_avg:95.74ms | |
| step:527/1770 train_time:50456ms step_avg:95.74ms | |
| step:528/1770 train_time:50553ms step_avg:95.74ms | |
| step:529/1770 train_time:50651ms step_avg:95.75ms | |
| step:530/1770 train_time:50749ms step_avg:95.75ms | |
| step:531/1770 train_time:50846ms step_avg:95.76ms | |
| step:532/1770 train_time:50945ms step_avg:95.76ms | |
| step:533/1770 train_time:51044ms step_avg:95.77ms | |
| step:534/1770 train_time:51142ms step_avg:95.77ms | |
| step:535/1770 train_time:51242ms step_avg:95.78ms | |
| step:536/1770 train_time:51342ms step_avg:95.79ms | |
| step:537/1770 train_time:51440ms step_avg:95.79ms | |
| step:538/1770 train_time:51538ms step_avg:95.80ms | |
| step:539/1770 train_time:51635ms step_avg:95.80ms | |
| step:540/1770 train_time:51732ms step_avg:95.80ms | |
| step:541/1770 train_time:51830ms step_avg:95.80ms | |
| step:542/1770 train_time:51930ms step_avg:95.81ms | |
| step:543/1770 train_time:52030ms step_avg:95.82ms | |
| step:544/1770 train_time:52130ms step_avg:95.83ms | |
| step:545/1770 train_time:52231ms step_avg:95.84ms | |
| step:546/1770 train_time:52332ms step_avg:95.85ms | |
| step:547/1770 train_time:52432ms step_avg:95.85ms | |
| step:548/1770 train_time:52532ms step_avg:95.86ms | |
| step:549/1770 train_time:52631ms step_avg:95.87ms | |
| step:550/1770 train_time:52728ms step_avg:95.87ms | |
| step:551/1770 train_time:52825ms step_avg:95.87ms | |
| step:552/1770 train_time:52924ms step_avg:95.88ms | |
| step:553/1770 train_time:53021ms step_avg:95.88ms | |
| step:554/1770 train_time:53120ms step_avg:95.88ms | |
| step:555/1770 train_time:53218ms step_avg:95.89ms | |
| step:556/1770 train_time:53317ms step_avg:95.89ms | |
| step:557/1770 train_time:53414ms step_avg:95.90ms | |
| step:558/1770 train_time:53512ms step_avg:95.90ms | |
| step:559/1770 train_time:53612ms step_avg:95.91ms | |
| step:560/1770 train_time:53712ms step_avg:95.91ms | |
| step:561/1770 train_time:53811ms step_avg:95.92ms | |
| step:562/1770 train_time:53909ms step_avg:95.92ms | |
| step:563/1770 train_time:54009ms step_avg:95.93ms | |
| step:564/1770 train_time:54108ms step_avg:95.94ms | |
| step:565/1770 train_time:54208ms step_avg:95.94ms | |
| step:566/1770 train_time:54307ms step_avg:95.95ms | |
| step:567/1770 train_time:54405ms step_avg:95.95ms | |
| step:568/1770 train_time:54505ms step_avg:95.96ms | |
| step:569/1770 train_time:54603ms step_avg:95.96ms | |
| step:570/1770 train_time:54701ms step_avg:95.97ms | |
| step:571/1770 train_time:54798ms step_avg:95.97ms | |
| step:572/1770 train_time:54896ms step_avg:95.97ms | |
| step:573/1770 train_time:54994ms step_avg:95.98ms | |
| step:574/1770 train_time:55093ms step_avg:95.98ms | |
| step:575/1770 train_time:55193ms step_avg:95.99ms | |
| step:576/1770 train_time:55292ms step_avg:95.99ms | |
| step:577/1770 train_time:55393ms step_avg:96.00ms | |
| step:578/1770 train_time:55493ms step_avg:96.01ms | |
| step:579/1770 train_time:55592ms step_avg:96.01ms | |
| step:580/1770 train_time:55690ms step_avg:96.02ms | |
| step:581/1770 train_time:55789ms step_avg:96.02ms | |
| step:582/1770 train_time:55887ms step_avg:96.02ms | |
| step:583/1770 train_time:55985ms step_avg:96.03ms | |
| step:584/1770 train_time:56083ms step_avg:96.03ms | |
| step:585/1770 train_time:56182ms step_avg:96.04ms | |
| step:586/1770 train_time:56280ms step_avg:96.04ms | |
| step:587/1770 train_time:56379ms step_avg:96.05ms | |
| step:588/1770 train_time:56477ms step_avg:96.05ms | |
| step:589/1770 train_time:56575ms step_avg:96.05ms | |
| step:590/1770 train_time:56674ms step_avg:96.06ms | |
| step:591/1770 train_time:56773ms step_avg:96.06ms | |
| step:592/1770 train_time:56872ms step_avg:96.07ms | |
| step:593/1770 train_time:56973ms step_avg:96.08ms | |
| step:594/1770 train_time:57072ms step_avg:96.08ms | |
| step:595/1770 train_time:57170ms step_avg:96.08ms | |
| step:596/1770 train_time:57269ms step_avg:96.09ms | |
| step:597/1770 train_time:57368ms step_avg:96.09ms | |
| step:598/1770 train_time:57468ms step_avg:96.10ms | |
| step:599/1770 train_time:57567ms step_avg:96.11ms | |
| step:600/1770 train_time:57666ms step_avg:96.11ms | |
| step:601/1770 train_time:57765ms step_avg:96.12ms | |
| step:602/1770 train_time:57863ms step_avg:96.12ms | |
| step:603/1770 train_time:57961ms step_avg:96.12ms | |
| step:604/1770 train_time:58059ms step_avg:96.12ms | |
| step:605/1770 train_time:58157ms step_avg:96.13ms | |
| step:606/1770 train_time:58255ms step_avg:96.13ms | |
| step:607/1770 train_time:58355ms step_avg:96.14ms | |
| step:608/1770 train_time:58454ms step_avg:96.14ms | |
| step:609/1770 train_time:58554ms step_avg:96.15ms | |
| step:610/1770 train_time:58652ms step_avg:96.15ms | |
| step:611/1770 train_time:58752ms step_avg:96.16ms | |
| step:612/1770 train_time:58852ms step_avg:96.16ms | |
| step:613/1770 train_time:58951ms step_avg:96.17ms | |
| step:614/1770 train_time:59052ms step_avg:96.18ms | |
| step:615/1770 train_time:59151ms step_avg:96.18ms | |
| step:616/1770 train_time:59252ms step_avg:96.19ms | |
| step:617/1770 train_time:59351ms step_avg:96.19ms | |
| step:618/1770 train_time:59450ms step_avg:96.20ms | |
| step:619/1770 train_time:59549ms step_avg:96.20ms | |
| step:620/1770 train_time:59649ms step_avg:96.21ms | |
| step:621/1770 train_time:59749ms step_avg:96.21ms | |
| step:622/1770 train_time:59849ms step_avg:96.22ms | |
| step:623/1770 train_time:59947ms step_avg:96.22ms | |
| step:624/1770 train_time:60046ms step_avg:96.23ms | |
| step:625/1770 train_time:60144ms step_avg:96.23ms | |
| step:625/1770 val_loss:3.6622 train_time:60426ms step_avg:96.68ms | |
| step:626/1770 train_time:60436ms step_avg:96.54ms | |
| step:627/1770 train_time:60444ms step_avg:96.40ms | |
| step:628/1770 train_time:60452ms step_avg:96.26ms | |
| step:629/1770 train_time:60545ms step_avg:96.26ms | |
| step:630/1770 train_time:60643ms step_avg:96.26ms | |
| step:631/1770 train_time:60742ms step_avg:96.26ms | |
| step:632/1770 train_time:60839ms step_avg:96.26ms | |
| step:633/1770 train_time:60937ms step_avg:96.27ms | |
| step:634/1770 train_time:61035ms step_avg:96.27ms | |
| step:635/1770 train_time:61133ms step_avg:96.27ms | |
| step:636/1770 train_time:61230ms step_avg:96.27ms | |
| step:637/1770 train_time:61329ms step_avg:96.28ms | |
| step:638/1770 train_time:61431ms step_avg:96.29ms | |
| step:639/1770 train_time:61533ms step_avg:96.30ms | |
| step:640/1770 train_time:61633ms step_avg:96.30ms | |
| step:641/1770 train_time:61733ms step_avg:96.31ms | |
| step:642/1770 train_time:61831ms step_avg:96.31ms | |
| step:643/1770 train_time:61929ms step_avg:96.31ms | |
| step:644/1770 train_time:62027ms step_avg:96.32ms | |
| step:645/1770 train_time:62125ms step_avg:96.32ms | |
| step:646/1770 train_time:62222ms step_avg:96.32ms | |
| step:647/1770 train_time:62320ms step_avg:96.32ms | |
| step:648/1770 train_time:62420ms step_avg:96.33ms | |
| step:649/1770 train_time:62520ms step_avg:96.33ms | |
| step:650/1770 train_time:62620ms step_avg:96.34ms | |
| step:651/1770 train_time:62719ms step_avg:96.34ms | |
| step:652/1770 train_time:62818ms step_avg:96.35ms | |
| step:653/1770 train_time:62915ms step_avg:96.35ms | |
| step:654/1770 train_time:63014ms step_avg:96.35ms | |
| step:655/1770 train_time:63114ms step_avg:96.36ms | |
| step:656/1770 train_time:63213ms step_avg:96.36ms | |
| step:657/1770 train_time:63313ms step_avg:96.37ms | |
| step:658/1770 train_time:63411ms step_avg:96.37ms | |
| step:659/1770 train_time:63511ms step_avg:96.38ms | |
| step:660/1770 train_time:63613ms step_avg:96.38ms | |
| step:661/1770 train_time:63715ms step_avg:96.39ms | |
| step:662/1770 train_time:63816ms step_avg:96.40ms | |
| step:663/1770 train_time:63916ms step_avg:96.40ms | |
| step:664/1770 train_time:64016ms step_avg:96.41ms | |
| step:665/1770 train_time:64117ms step_avg:96.42ms | |
| step:666/1770 train_time:64217ms step_avg:96.42ms | |
| step:667/1770 train_time:64318ms step_avg:96.43ms | |
| step:668/1770 train_time:64418ms step_avg:96.43ms | |
| step:669/1770 train_time:64519ms step_avg:96.44ms | |
| step:670/1770 train_time:64620ms step_avg:96.45ms | |
| step:671/1770 train_time:64720ms step_avg:96.45ms | |
| step:672/1770 train_time:64821ms step_avg:96.46ms | |
| step:673/1770 train_time:64921ms step_avg:96.46ms | |
| step:674/1770 train_time:65020ms step_avg:96.47ms | |
| step:675/1770 train_time:65120ms step_avg:96.47ms | |
| step:676/1770 train_time:65220ms step_avg:96.48ms | |
| step:677/1770 train_time:65320ms step_avg:96.48ms | |
| step:678/1770 train_time:65421ms step_avg:96.49ms | |
| step:679/1770 train_time:65521ms step_avg:96.50ms | |
| step:680/1770 train_time:65621ms step_avg:96.50ms | |
| step:681/1770 train_time:65721ms step_avg:96.51ms | |
| step:682/1770 train_time:65821ms step_avg:96.51ms | |
| step:683/1770 train_time:65920ms step_avg:96.52ms | |
| step:684/1770 train_time:66020ms step_avg:96.52ms | |
| step:685/1770 train_time:66121ms step_avg:96.53ms | |
| step:686/1770 train_time:66220ms step_avg:96.53ms | |
| step:687/1770 train_time:66320ms step_avg:96.54ms | |
| step:688/1770 train_time:66420ms step_avg:96.54ms | |
| step:689/1770 train_time:66521ms step_avg:96.55ms | |
| step:690/1770 train_time:66621ms step_avg:96.55ms | |
| step:691/1770 train_time:66721ms step_avg:96.56ms | |
| step:692/1770 train_time:66821ms step_avg:96.56ms | |
| step:693/1770 train_time:66920ms step_avg:96.57ms | |
| step:694/1770 train_time:67020ms step_avg:96.57ms | |
| step:695/1770 train_time:67120ms step_avg:96.58ms | |
| step:696/1770 train_time:67222ms step_avg:96.58ms | |
| step:697/1770 train_time:67321ms step_avg:96.59ms | |
| step:698/1770 train_time:67421ms step_avg:96.59ms | |
| step:699/1770 train_time:67521ms step_avg:96.60ms | |
| step:700/1770 train_time:67621ms step_avg:96.60ms | |
| step:701/1770 train_time:67721ms step_avg:96.61ms | |
| step:702/1770 train_time:67820ms step_avg:96.61ms | |
| step:703/1770 train_time:67921ms step_avg:96.62ms | |
| step:704/1770 train_time:68020ms step_avg:96.62ms | |
| step:705/1770 train_time:68121ms step_avg:96.63ms | |
| step:706/1770 train_time:68221ms step_avg:96.63ms | |
| step:707/1770 train_time:68320ms step_avg:96.63ms | |
| step:708/1770 train_time:68420ms step_avg:96.64ms | |
| step:709/1770 train_time:68520ms step_avg:96.64ms | |
| step:710/1770 train_time:68621ms step_avg:96.65ms | |
| step:711/1770 train_time:68721ms step_avg:96.65ms | |
| step:712/1770 train_time:68821ms step_avg:96.66ms | |
| step:713/1770 train_time:68922ms step_avg:96.66ms | |
| step:714/1770 train_time:69021ms step_avg:96.67ms | |
| step:715/1770 train_time:69121ms step_avg:96.67ms | |
| step:716/1770 train_time:69221ms step_avg:96.68ms | |
| step:717/1770 train_time:69320ms step_avg:96.68ms | |
| step:718/1770 train_time:69421ms step_avg:96.69ms | |
| step:719/1770 train_time:69520ms step_avg:96.69ms | |
| step:720/1770 train_time:69620ms step_avg:96.69ms | |
| step:721/1770 train_time:69720ms step_avg:96.70ms | |
| step:722/1770 train_time:69820ms step_avg:96.70ms | |
| step:723/1770 train_time:69920ms step_avg:96.71ms | |
| step:724/1770 train_time:70021ms step_avg:96.71ms | |
| step:725/1770 train_time:70121ms step_avg:96.72ms | |
| step:726/1770 train_time:70221ms step_avg:96.72ms | |
| step:727/1770 train_time:70321ms step_avg:96.73ms | |
| step:728/1770 train_time:70420ms step_avg:96.73ms | |
| step:729/1770 train_time:70520ms step_avg:96.74ms | |
| step:730/1770 train_time:70621ms step_avg:96.74ms | |
| step:731/1770 train_time:70721ms step_avg:96.74ms | |
| step:732/1770 train_time:70821ms step_avg:96.75ms | |
| step:733/1770 train_time:70921ms step_avg:96.75ms | |
| step:734/1770 train_time:71021ms step_avg:96.76ms | |
| step:735/1770 train_time:71121ms step_avg:96.76ms | |
| step:736/1770 train_time:71221ms step_avg:96.77ms | |
| step:737/1770 train_time:71321ms step_avg:96.77ms | |
| step:738/1770 train_time:71420ms step_avg:96.78ms | |
| step:739/1770 train_time:71520ms step_avg:96.78ms | |
| step:740/1770 train_time:71620ms step_avg:96.78ms | |
| step:741/1770 train_time:71720ms step_avg:96.79ms | |
| step:742/1770 train_time:71820ms step_avg:96.79ms | |
| step:743/1770 train_time:71919ms step_avg:96.80ms | |
| step:744/1770 train_time:72020ms step_avg:96.80ms | |
| step:745/1770 train_time:72120ms step_avg:96.81ms | |
| step:746/1770 train_time:72221ms step_avg:96.81ms | |
| step:747/1770 train_time:72320ms step_avg:96.81ms | |
| step:748/1770 train_time:72420ms step_avg:96.82ms | |
| step:749/1770 train_time:72520ms step_avg:96.82ms | |
| step:750/1770 train_time:72620ms step_avg:96.83ms | |
| step:750/1770 val_loss:3.5996 train_time:72907ms step_avg:97.21ms | |
| step:751/1770 train_time:72916ms step_avg:97.09ms | |
| step:752/1770 train_time:72925ms step_avg:96.97ms | |
| step:753/1770 train_time:72934ms step_avg:96.86ms | |
| step:754/1770 train_time:73027ms step_avg:96.85ms | |
| step:755/1770 train_time:73126ms step_avg:96.86ms | |
| step:756/1770 train_time:73225ms step_avg:96.86ms | |
| step:757/1770 train_time:73325ms step_avg:96.86ms | |
| step:758/1770 train_time:73425ms step_avg:96.87ms | |
| step:759/1770 train_time:73525ms step_avg:96.87ms | |
| step:760/1770 train_time:73625ms step_avg:96.87ms | |
| step:761/1770 train_time:73724ms step_avg:96.88ms | |
| step:762/1770 train_time:73825ms step_avg:96.88ms | |
| step:763/1770 train_time:73928ms step_avg:96.89ms | |
| step:764/1770 train_time:74030ms step_avg:96.90ms | |
| step:765/1770 train_time:74130ms step_avg:96.90ms | |
| step:766/1770 train_time:74230ms step_avg:96.91ms | |
| step:767/1770 train_time:74330ms step_avg:96.91ms | |
| step:768/1770 train_time:74429ms step_avg:96.91ms | |
| step:769/1770 train_time:74528ms step_avg:96.92ms | |
| step:770/1770 train_time:74627ms step_avg:96.92ms | |
| step:771/1770 train_time:74726ms step_avg:96.92ms | |
| step:772/1770 train_time:74826ms step_avg:96.92ms | |
| step:773/1770 train_time:74928ms step_avg:96.93ms | |
| step:774/1770 train_time:75029ms step_avg:96.94ms | |
| step:775/1770 train_time:75129ms step_avg:96.94ms | |
| step:776/1770 train_time:75228ms step_avg:96.94ms | |
| step:777/1770 train_time:75328ms step_avg:96.95ms | |
| step:778/1770 train_time:75428ms step_avg:96.95ms | |
| step:779/1770 train_time:75527ms step_avg:96.95ms | |
| step:780/1770 train_time:75627ms step_avg:96.96ms | |
| step:781/1770 train_time:75726ms step_avg:96.96ms | |
| step:782/1770 train_time:75826ms step_avg:96.96ms | |
| step:783/1770 train_time:75927ms step_avg:96.97ms | |
| step:784/1770 train_time:76027ms step_avg:96.97ms | |
| step:785/1770 train_time:76128ms step_avg:96.98ms | |
| step:786/1770 train_time:76228ms step_avg:96.98ms | |
| step:787/1770 train_time:76328ms step_avg:96.99ms | |
| step:788/1770 train_time:76428ms step_avg:96.99ms | |
| step:789/1770 train_time:76527ms step_avg:96.99ms | |
| step:790/1770 train_time:76627ms step_avg:97.00ms | |
| step:791/1770 train_time:76727ms step_avg:97.00ms | |
| step:792/1770 train_time:76827ms step_avg:97.00ms | |
| step:793/1770 train_time:76928ms step_avg:97.01ms | |
| step:794/1770 train_time:77029ms step_avg:97.01ms | |
| step:795/1770 train_time:77129ms step_avg:97.02ms | |
| step:796/1770 train_time:77230ms step_avg:97.02ms | |
| step:797/1770 train_time:77330ms step_avg:97.03ms | |
| step:798/1770 train_time:77429ms step_avg:97.03ms | |
| step:799/1770 train_time:77529ms step_avg:97.03ms | |
| step:800/1770 train_time:77628ms step_avg:97.04ms | |
| step:801/1770 train_time:77728ms step_avg:97.04ms | |
| step:802/1770 train_time:77828ms step_avg:97.04ms | |
| step:803/1770 train_time:77929ms step_avg:97.05ms | |
| step:804/1770 train_time:78030ms step_avg:97.05ms | |
| step:805/1770 train_time:78131ms step_avg:97.06ms | |
| step:806/1770 train_time:78230ms step_avg:97.06ms | |
| step:807/1770 train_time:78331ms step_avg:97.06ms | |
| step:808/1770 train_time:78430ms step_avg:97.07ms | |
| step:809/1770 train_time:78530ms step_avg:97.07ms | |
| step:810/1770 train_time:78630ms step_avg:97.07ms | |
| step:811/1770 train_time:78729ms step_avg:97.08ms | |
| step:812/1770 train_time:78830ms step_avg:97.08ms | |
| step:813/1770 train_time:78929ms step_avg:97.08ms | |
| step:814/1770 train_time:79029ms step_avg:97.09ms | |
| step:815/1770 train_time:79129ms step_avg:97.09ms | |
| step:816/1770 train_time:79228ms step_avg:97.09ms | |
| step:817/1770 train_time:79329ms step_avg:97.10ms | |
| step:818/1770 train_time:79429ms step_avg:97.10ms | |
| step:819/1770 train_time:79529ms step_avg:97.11ms | |
| step:820/1770 train_time:79630ms step_avg:97.11ms | |
| step:821/1770 train_time:79730ms step_avg:97.11ms | |
| step:822/1770 train_time:79829ms step_avg:97.12ms | |
| step:823/1770 train_time:79928ms step_avg:97.12ms | |
| step:824/1770 train_time:80027ms step_avg:97.12ms | |
| step:825/1770 train_time:80127ms step_avg:97.12ms | |
| step:826/1770 train_time:80226ms step_avg:97.13ms | |
| step:827/1770 train_time:80327ms step_avg:97.13ms | |
| step:828/1770 train_time:80428ms step_avg:97.13ms | |
| step:829/1770 train_time:80528ms step_avg:97.14ms | |
| step:830/1770 train_time:80628ms step_avg:97.14ms | |
| step:831/1770 train_time:80729ms step_avg:97.15ms | |
| step:832/1770 train_time:80829ms step_avg:97.15ms | |
| step:833/1770 train_time:80928ms step_avg:97.15ms | |
| step:834/1770 train_time:81028ms step_avg:97.16ms | |
| step:835/1770 train_time:81129ms step_avg:97.16ms | |
| step:836/1770 train_time:81229ms step_avg:97.16ms | |
| step:837/1770 train_time:81328ms step_avg:97.17ms | |
| step:838/1770 train_time:81429ms step_avg:97.17ms | |
| step:839/1770 train_time:81530ms step_avg:97.17ms | |
| step:840/1770 train_time:81629ms step_avg:97.18ms | |
| step:841/1770 train_time:81730ms step_avg:97.18ms | |
| step:842/1770 train_time:81829ms step_avg:97.18ms | |
| step:843/1770 train_time:81930ms step_avg:97.19ms | |
| step:844/1770 train_time:82028ms step_avg:97.19ms | |
| step:845/1770 train_time:82128ms step_avg:97.19ms | |
| step:846/1770 train_time:82227ms step_avg:97.20ms | |
| step:847/1770 train_time:82327ms step_avg:97.20ms | |
| step:848/1770 train_time:82428ms step_avg:97.20ms | |
| step:849/1770 train_time:82528ms step_avg:97.21ms | |
| step:850/1770 train_time:82629ms step_avg:97.21ms | |
| step:851/1770 train_time:82729ms step_avg:97.21ms | |
| step:852/1770 train_time:82829ms step_avg:97.22ms | |
| step:853/1770 train_time:82929ms step_avg:97.22ms | |
| step:854/1770 train_time:83029ms step_avg:97.22ms | |
| step:855/1770 train_time:83128ms step_avg:97.23ms | |
| step:856/1770 train_time:83228ms step_avg:97.23ms | |
| step:857/1770 train_time:83328ms step_avg:97.23ms | |
| step:858/1770 train_time:83428ms step_avg:97.24ms | |
| step:859/1770 train_time:83528ms step_avg:97.24ms | |
| step:860/1770 train_time:83628ms step_avg:97.24ms | |
| step:861/1770 train_time:83728ms step_avg:97.25ms | |
| step:862/1770 train_time:83828ms step_avg:97.25ms | |
| step:863/1770 train_time:83928ms step_avg:97.25ms | |
| step:864/1770 train_time:84028ms step_avg:97.25ms | |
| step:865/1770 train_time:84129ms step_avg:97.26ms | |
| step:866/1770 train_time:84228ms step_avg:97.26ms | |
| step:867/1770 train_time:84328ms step_avg:97.26ms | |
| step:868/1770 train_time:84428ms step_avg:97.27ms | |
| step:869/1770 train_time:84529ms step_avg:97.27ms | |
| step:870/1770 train_time:84629ms step_avg:97.27ms | |
| step:871/1770 train_time:84729ms step_avg:97.28ms | |
| step:872/1770 train_time:84828ms step_avg:97.28ms | |
| step:873/1770 train_time:84929ms step_avg:97.28ms | |
| step:874/1770 train_time:85028ms step_avg:97.29ms | |
| step:875/1770 train_time:85128ms step_avg:97.29ms | |
| step:875/1770 val_loss:3.5489 train_time:85415ms step_avg:97.62ms | |
| step:876/1770 train_time:85425ms step_avg:97.52ms | |
| step:877/1770 train_time:85433ms step_avg:97.41ms | |
| step:878/1770 train_time:85441ms step_avg:97.31ms | |
| step:879/1770 train_time:85532ms step_avg:97.31ms | |
| step:880/1770 train_time:85633ms step_avg:97.31ms | |
| step:881/1770 train_time:85732ms step_avg:97.31ms | |
| step:882/1770 train_time:85831ms step_avg:97.31ms | |
| step:883/1770 train_time:85930ms step_avg:97.32ms | |
| step:884/1770 train_time:86029ms step_avg:97.32ms | |
| step:885/1770 train_time:86129ms step_avg:97.32ms | |
| step:886/1770 train_time:86229ms step_avg:97.32ms | |
| step:887/1770 train_time:86331ms step_avg:97.33ms | |
| step:888/1770 train_time:86435ms step_avg:97.34ms | |
| step:889/1770 train_time:86536ms step_avg:97.34ms | |
| step:890/1770 train_time:86636ms step_avg:97.34ms | |
| step:891/1770 train_time:86736ms step_avg:97.35ms | |
| step:892/1770 train_time:86836ms step_avg:97.35ms | |
| step:893/1770 train_time:86935ms step_avg:97.35ms | |
| step:894/1770 train_time:87034ms step_avg:97.35ms | |
| step:895/1770 train_time:87133ms step_avg:97.36ms | |
| step:896/1770 train_time:87232ms step_avg:97.36ms | |
| step:897/1770 train_time:87333ms step_avg:97.36ms | |
| step:898/1770 train_time:87434ms step_avg:97.36ms | |
| step:899/1770 train_time:87535ms step_avg:97.37ms | |
| step:900/1770 train_time:87636ms step_avg:97.37ms | |
| step:901/1770 train_time:87736ms step_avg:97.38ms | |
| step:902/1770 train_time:87837ms step_avg:97.38ms | |
| step:903/1770 train_time:87936ms step_avg:97.38ms | |
| step:904/1770 train_time:88035ms step_avg:97.38ms | |
| step:905/1770 train_time:88135ms step_avg:97.39ms | |
| step:906/1770 train_time:88236ms step_avg:97.39ms | |
| step:907/1770 train_time:88337ms step_avg:97.39ms | |
| step:908/1770 train_time:88437ms step_avg:97.40ms | |
| step:909/1770 train_time:88538ms step_avg:97.40ms | |
| step:910/1770 train_time:88638ms step_avg:97.40ms | |
| step:911/1770 train_time:88738ms step_avg:97.41ms | |
| step:912/1770 train_time:88838ms step_avg:97.41ms | |
| step:913/1770 train_time:88938ms step_avg:97.41ms | |
| step:914/1770 train_time:89039ms step_avg:97.42ms | |
| step:915/1770 train_time:89138ms step_avg:97.42ms | |
| step:916/1770 train_time:89240ms step_avg:97.42ms | |
| step:917/1770 train_time:89341ms step_avg:97.43ms | |
| step:918/1770 train_time:89441ms step_avg:97.43ms | |
| step:919/1770 train_time:89542ms step_avg:97.43ms | |
| step:920/1770 train_time:89643ms step_avg:97.44ms | |
| step:921/1770 train_time:89744ms step_avg:97.44ms | |
| step:922/1770 train_time:89846ms step_avg:97.45ms | |
| step:923/1770 train_time:89949ms step_avg:97.45ms | |
| step:924/1770 train_time:90051ms step_avg:97.46ms | |
| step:925/1770 train_time:90153ms step_avg:97.46ms | |
| step:926/1770 train_time:90253ms step_avg:97.47ms | |
| step:927/1770 train_time:90354ms step_avg:97.47ms | |
| step:928/1770 train_time:90454ms step_avg:97.47ms | |
| step:929/1770 train_time:90555ms step_avg:97.48ms | |
| step:930/1770 train_time:90656ms step_avg:97.48ms | |
| step:931/1770 train_time:90757ms step_avg:97.48ms | |
| step:932/1770 train_time:90860ms step_avg:97.49ms | |
| step:933/1770 train_time:90961ms step_avg:97.49ms | |
| step:934/1770 train_time:91063ms step_avg:97.50ms | |
| step:935/1770 train_time:91164ms step_avg:97.50ms | |
| step:936/1770 train_time:91268ms step_avg:97.51ms | |
| step:937/1770 train_time:91371ms step_avg:97.51ms | |
| step:938/1770 train_time:91473ms step_avg:97.52ms | |
| step:939/1770 train_time:91574ms step_avg:97.52ms | |
| step:940/1770 train_time:91675ms step_avg:97.53ms | |
| step:941/1770 train_time:91776ms step_avg:97.53ms | |
| step:942/1770 train_time:91877ms step_avg:97.53ms | |
| step:943/1770 train_time:91978ms step_avg:97.54ms | |
| step:944/1770 train_time:92080ms step_avg:97.54ms | |
| step:945/1770 train_time:92182ms step_avg:97.55ms | |
| step:946/1770 train_time:92285ms step_avg:97.55ms | |
| step:947/1770 train_time:92387ms step_avg:97.56ms | |
| step:948/1770 train_time:92491ms step_avg:97.56ms | |
| step:949/1770 train_time:92593ms step_avg:97.57ms | |
| step:950/1770 train_time:92694ms step_avg:97.57ms | |
| step:951/1770 train_time:92794ms step_avg:97.58ms | |
| step:952/1770 train_time:92895ms step_avg:97.58ms | |
| step:953/1770 train_time:92995ms step_avg:97.58ms | |
| step:954/1770 train_time:93097ms step_avg:97.59ms | |
| step:955/1770 train_time:93199ms step_avg:97.59ms | |
| step:956/1770 train_time:93301ms step_avg:97.60ms | |
| step:957/1770 train_time:93405ms step_avg:97.60ms | |
| step:958/1770 train_time:93508ms step_avg:97.61ms | |
| step:959/1770 train_time:93611ms step_avg:97.61ms | |
| step:960/1770 train_time:93712ms step_avg:97.62ms | |
| step:961/1770 train_time:93814ms step_avg:97.62ms | |
| step:962/1770 train_time:93915ms step_avg:97.62ms | |
| step:963/1770 train_time:94015ms step_avg:97.63ms | |
| step:964/1770 train_time:94117ms step_avg:97.63ms | |
| step:965/1770 train_time:94218ms step_avg:97.63ms | |
| step:966/1770 train_time:94321ms step_avg:97.64ms | |
| step:967/1770 train_time:94425ms step_avg:97.65ms | |
| step:968/1770 train_time:94527ms step_avg:97.65ms | |
| step:969/1770 train_time:94631ms step_avg:97.66ms | |
| step:970/1770 train_time:94732ms step_avg:97.66ms | |
| step:971/1770 train_time:94833ms step_avg:97.67ms | |
| step:972/1770 train_time:94934ms step_avg:97.67ms | |
| step:973/1770 train_time:95035ms step_avg:97.67ms | |
| step:974/1770 train_time:95135ms step_avg:97.67ms | |
| step:975/1770 train_time:95237ms step_avg:97.68ms | |
| step:976/1770 train_time:95338ms step_avg:97.68ms | |
| step:977/1770 train_time:95440ms step_avg:97.69ms | |
| step:978/1770 train_time:95544ms step_avg:97.69ms | |
| step:979/1770 train_time:95646ms step_avg:97.70ms | |
| step:980/1770 train_time:95748ms step_avg:97.70ms | |
| step:981/1770 train_time:95851ms step_avg:97.71ms | |
| step:982/1770 train_time:95952ms step_avg:97.71ms | |
| step:983/1770 train_time:96054ms step_avg:97.72ms | |
| step:984/1770 train_time:96154ms step_avg:97.72ms | |
| step:985/1770 train_time:96255ms step_avg:97.72ms | |
| step:986/1770 train_time:96357ms step_avg:97.72ms | |
| step:987/1770 train_time:96458ms step_avg:97.73ms | |
| step:988/1770 train_time:96560ms step_avg:97.73ms | |
| step:989/1770 train_time:96662ms step_avg:97.74ms | |
| step:990/1770 train_time:96764ms step_avg:97.74ms | |
| step:991/1770 train_time:96868ms step_avg:97.75ms | |
| step:992/1770 train_time:96971ms step_avg:97.75ms | |
| step:993/1770 train_time:97073ms step_avg:97.76ms | |
| step:994/1770 train_time:97173ms step_avg:97.76ms | |
| step:995/1770 train_time:97274ms step_avg:97.76ms | |
| step:996/1770 train_time:97374ms step_avg:97.77ms | |
| step:997/1770 train_time:97476ms step_avg:97.77ms | |
| step:998/1770 train_time:97577ms step_avg:97.77ms | |
| step:999/1770 train_time:97679ms step_avg:97.78ms | |
| step:1000/1770 train_time:97781ms step_avg:97.78ms | |
| step:1000/1770 val_loss:3.5120 train_time:98079ms step_avg:98.08ms | |
| step:1001/1770 train_time:98088ms step_avg:97.99ms | |
| step:1002/1770 train_time:98097ms step_avg:97.90ms | |
| step:1003/1770 train_time:98105ms step_avg:97.81ms | |
| step:1004/1770 train_time:98198ms step_avg:97.81ms | |
| step:1005/1770 train_time:98301ms step_avg:97.81ms | |
| step:1006/1770 train_time:98405ms step_avg:97.82ms | |
| step:1007/1770 train_time:98507ms step_avg:97.82ms | |
| step:1008/1770 train_time:98608ms step_avg:97.83ms | |
| step:1009/1770 train_time:98709ms step_avg:97.83ms | |
| step:1010/1770 train_time:98810ms step_avg:97.83ms | |
| step:1011/1770 train_time:98910ms step_avg:97.83ms | |
| step:1012/1770 train_time:99011ms step_avg:97.84ms | |
| step:1013/1770 train_time:99114ms step_avg:97.84ms | |
| step:1014/1770 train_time:99216ms step_avg:97.85ms | |
| step:1015/1770 train_time:99318ms step_avg:97.85ms | |
| step:1016/1770 train_time:99420ms step_avg:97.85ms | |
| step:1017/1770 train_time:99522ms step_avg:97.86ms | |
| step:1018/1770 train_time:99626ms step_avg:97.86ms | |
| step:1019/1770 train_time:99727ms step_avg:97.87ms | |
| step:1020/1770 train_time:99828ms step_avg:97.87ms | |
| step:1021/1770 train_time:99929ms step_avg:97.87ms | |
| step:1022/1770 train_time:100030ms step_avg:97.88ms | |
| step:1023/1770 train_time:100131ms step_avg:97.88ms | |
| step:1024/1770 train_time:100233ms step_avg:97.88ms | |
| step:1025/1770 train_time:100334ms step_avg:97.89ms | |
| step:1026/1770 train_time:100435ms step_avg:97.89ms | |
| step:1027/1770 train_time:100538ms step_avg:97.89ms | |
| step:1028/1770 train_time:100640ms step_avg:97.90ms | |
| step:1029/1770 train_time:100742ms step_avg:97.90ms | |
| step:1030/1770 train_time:100844ms step_avg:97.91ms | |
| step:1031/1770 train_time:100948ms step_avg:97.91ms | |
| step:1032/1770 train_time:101049ms step_avg:97.92ms | |
| step:1033/1770 train_time:101150ms step_avg:97.92ms | |
| step:1034/1770 train_time:101251ms step_avg:97.92ms | |
| step:1035/1770 train_time:101352ms step_avg:97.93ms | |
| step:1036/1770 train_time:101453ms step_avg:97.93ms | |
| step:1037/1770 train_time:101555ms step_avg:97.93ms | |
| step:1038/1770 train_time:101656ms step_avg:97.93ms | |
| step:1039/1770 train_time:101758ms step_avg:97.94ms | |
| step:1040/1770 train_time:101862ms step_avg:97.94ms | |
| step:1041/1770 train_time:101965ms step_avg:97.95ms | |
| step:1042/1770 train_time:102067ms step_avg:97.95ms | |
| step:1043/1770 train_time:102169ms step_avg:97.96ms | |
| step:1044/1770 train_time:102270ms step_avg:97.96ms | |
| step:1045/1770 train_time:102371ms step_avg:97.96ms | |
| step:1046/1770 train_time:102472ms step_avg:97.97ms | |
| step:1047/1770 train_time:102572ms step_avg:97.97ms | |
| step:1048/1770 train_time:102673ms step_avg:97.97ms | |
| step:1049/1770 train_time:102775ms step_avg:97.97ms | |
| step:1050/1770 train_time:102877ms step_avg:97.98ms | |
| step:1051/1770 train_time:102979ms step_avg:97.98ms | |
| step:1052/1770 train_time:103082ms step_avg:97.99ms | |
| step:1053/1770 train_time:103186ms step_avg:97.99ms | |
| step:1054/1770 train_time:103288ms step_avg:98.00ms | |
| step:1055/1770 train_time:103390ms step_avg:98.00ms | |
| step:1056/1770 train_time:103490ms step_avg:98.00ms | |
| step:1057/1770 train_time:103591ms step_avg:98.00ms | |
| step:1058/1770 train_time:103691ms step_avg:98.01ms | |
| step:1059/1770 train_time:103792ms step_avg:98.01ms | |
| step:1060/1770 train_time:103894ms step_avg:98.01ms | |
| step:1061/1770 train_time:103996ms step_avg:98.02ms | |
| step:1062/1770 train_time:104098ms step_avg:98.02ms | |
| step:1063/1770 train_time:104202ms step_avg:98.03ms | |
| step:1064/1770 train_time:104306ms step_avg:98.03ms | |
| step:1065/1770 train_time:104409ms step_avg:98.04ms | |
| step:1066/1770 train_time:104510ms step_avg:98.04ms | |
| step:1067/1770 train_time:104612ms step_avg:98.04ms | |
| step:1068/1770 train_time:104712ms step_avg:98.04ms | |
| step:1069/1770 train_time:104812ms step_avg:98.05ms | |
| step:1070/1770 train_time:104914ms step_avg:98.05ms | |
| step:1071/1770 train_time:105015ms step_avg:98.05ms | |
| step:1072/1770 train_time:105117ms step_avg:98.06ms | |
| step:1073/1770 train_time:105219ms step_avg:98.06ms | |
| step:1074/1770 train_time:105323ms step_avg:98.07ms | |
| step:1075/1770 train_time:105426ms step_avg:98.07ms | |
| step:1076/1770 train_time:105528ms step_avg:98.07ms | |
| step:1077/1770 train_time:105629ms step_avg:98.08ms | |
| step:1078/1770 train_time:105731ms step_avg:98.08ms | |
| step:1079/1770 train_time:105832ms step_avg:98.08ms | |
| step:1080/1770 train_time:105932ms step_avg:98.09ms | |
| step:1081/1770 train_time:106033ms step_avg:98.09ms | |
| step:1082/1770 train_time:106136ms step_avg:98.09ms | |
| step:1083/1770 train_time:106238ms step_avg:98.10ms | |
| step:1084/1770 train_time:106340ms step_avg:98.10ms | |
| step:1085/1770 train_time:106444ms step_avg:98.10ms | |
| step:1086/1770 train_time:106546ms step_avg:98.11ms | |
| step:1087/1770 train_time:106649ms step_avg:98.11ms | |
| step:1088/1770 train_time:106751ms step_avg:98.12ms | |
| step:1089/1770 train_time:106852ms step_avg:98.12ms | |
| step:1090/1770 train_time:106952ms step_avg:98.12ms | |
| step:1091/1770 train_time:107053ms step_avg:98.12ms | |
| step:1092/1770 train_time:107155ms step_avg:98.13ms | |
| step:1093/1770 train_time:107257ms step_avg:98.13ms | |
| step:1094/1770 train_time:107360ms step_avg:98.14ms | |
| step:1095/1770 train_time:107462ms step_avg:98.14ms | |
| step:1096/1770 train_time:107565ms step_avg:98.14ms | |
| step:1097/1770 train_time:107669ms step_avg:98.15ms | |
| step:1098/1770 train_time:107770ms step_avg:98.15ms | |
| step:1099/1770 train_time:107873ms step_avg:98.16ms | |
| step:1100/1770 train_time:107975ms step_avg:98.16ms | |
| step:1101/1770 train_time:108075ms step_avg:98.16ms | |
| step:1102/1770 train_time:108176ms step_avg:98.16ms | |
| step:1103/1770 train_time:108278ms step_avg:98.17ms | |
| step:1104/1770 train_time:108381ms step_avg:98.17ms | |
| step:1105/1770 train_time:108484ms step_avg:98.18ms | |
| step:1106/1770 train_time:108589ms step_avg:98.18ms | |
| step:1107/1770 train_time:108690ms step_avg:98.18ms | |
| step:1108/1770 train_time:108792ms step_avg:98.19ms | |
| step:1109/1770 train_time:108894ms step_avg:98.19ms | |
| step:1110/1770 train_time:108995ms step_avg:98.19ms | |
| step:1111/1770 train_time:109096ms step_avg:98.20ms | |
| step:1112/1770 train_time:109198ms step_avg:98.20ms | |
| step:1113/1770 train_time:109299ms step_avg:98.20ms | |
| step:1114/1770 train_time:109402ms step_avg:98.21ms | |
| step:1115/1770 train_time:109505ms step_avg:98.21ms | |
| step:1116/1770 train_time:109608ms step_avg:98.22ms | |
| step:1117/1770 train_time:109711ms step_avg:98.22ms | |
| step:1118/1770 train_time:109811ms step_avg:98.22ms | |
| step:1119/1770 train_time:109914ms step_avg:98.23ms | |
| step:1120/1770 train_time:110015ms step_avg:98.23ms | |
| step:1121/1770 train_time:110116ms step_avg:98.23ms | |
| step:1122/1770 train_time:110218ms step_avg:98.23ms | |
| step:1123/1770 train_time:110319ms step_avg:98.24ms | |
| step:1124/1770 train_time:110423ms step_avg:98.24ms | |
| step:1125/1770 train_time:110526ms step_avg:98.24ms | |
| step:1125/1770 val_loss:3.4715 train_time:110820ms step_avg:98.51ms | |
| step:1126/1770 train_time:110829ms step_avg:98.43ms | |
| step:1127/1770 train_time:110838ms step_avg:98.35ms | |
| step:1128/1770 train_time:110846ms step_avg:98.27ms | |
| step:1129/1770 train_time:110937ms step_avg:98.26ms | |
| step:1130/1770 train_time:111039ms step_avg:98.26ms | |
| step:1131/1770 train_time:111139ms step_avg:98.27ms | |
| step:1132/1770 train_time:111240ms step_avg:98.27ms | |
| step:1133/1770 train_time:111340ms step_avg:98.27ms | |
| step:1134/1770 train_time:111440ms step_avg:98.27ms | |
| step:1135/1770 train_time:111540ms step_avg:98.27ms | |
| step:1136/1770 train_time:111641ms step_avg:98.28ms | |
| step:1137/1770 train_time:111745ms step_avg:98.28ms | |
| step:1138/1770 train_time:111850ms step_avg:98.29ms | |
| step:1139/1770 train_time:111955ms step_avg:98.29ms | |
| step:1140/1770 train_time:112056ms step_avg:98.29ms | |
| step:1141/1770 train_time:112160ms step_avg:98.30ms | |
| step:1142/1770 train_time:112261ms step_avg:98.30ms | |
| step:1143/1770 train_time:112361ms step_avg:98.30ms | |
| step:1144/1770 train_time:112462ms step_avg:98.31ms | |
| step:1145/1770 train_time:112562ms step_avg:98.31ms | |
| step:1146/1770 train_time:112663ms step_avg:98.31ms | |
| step:1147/1770 train_time:112764ms step_avg:98.31ms | |
| step:1148/1770 train_time:112869ms step_avg:98.32ms | |
| step:1149/1770 train_time:112973ms step_avg:98.32ms | |
| step:1150/1770 train_time:113077ms step_avg:98.33ms | |
| step:1151/1770 train_time:113179ms step_avg:98.33ms | |
| step:1152/1770 train_time:113281ms step_avg:98.33ms | |
| step:1153/1770 train_time:113382ms step_avg:98.34ms | |
| step:1154/1770 train_time:113483ms step_avg:98.34ms | |
| step:1155/1770 train_time:113584ms step_avg:98.34ms | |
| step:1156/1770 train_time:113686ms step_avg:98.34ms | |
| step:1157/1770 train_time:113789ms step_avg:98.35ms | |
| step:1158/1770 train_time:113891ms step_avg:98.35ms | |
| step:1159/1770 train_time:113994ms step_avg:98.36ms | |
| step:1160/1770 train_time:114097ms step_avg:98.36ms | |
| step:1161/1770 train_time:114199ms step_avg:98.36ms | |
| step:1162/1770 train_time:114300ms step_avg:98.36ms | |
| step:1163/1770 train_time:114400ms step_avg:98.37ms | |
| step:1164/1770 train_time:114502ms step_avg:98.37ms | |
| step:1165/1770 train_time:114603ms step_avg:98.37ms | |
| step:1166/1770 train_time:114703ms step_avg:98.37ms | |
| step:1167/1770 train_time:114805ms step_avg:98.38ms | |
| step:1168/1770 train_time:114907ms step_avg:98.38ms | |
| step:1169/1770 train_time:115010ms step_avg:98.38ms | |
| step:1170/1770 train_time:115114ms step_avg:98.39ms | |
| step:1171/1770 train_time:115218ms step_avg:98.39ms | |
| step:1172/1770 train_time:115320ms step_avg:98.40ms | |
| step:1173/1770 train_time:115420ms step_avg:98.40ms | |
| step:1174/1770 train_time:115521ms step_avg:98.40ms | |
| step:1175/1770 train_time:115622ms step_avg:98.40ms | |
| step:1176/1770 train_time:115723ms step_avg:98.40ms | |
| step:1177/1770 train_time:115825ms step_avg:98.41ms | |
| step:1178/1770 train_time:115926ms step_avg:98.41ms | |
| step:1179/1770 train_time:116029ms step_avg:98.41ms | |
| step:1180/1770 train_time:116131ms step_avg:98.42ms | |
| step:1181/1770 train_time:116234ms step_avg:98.42ms | |
| step:1182/1770 train_time:116337ms step_avg:98.42ms | |
| step:1183/1770 train_time:116439ms step_avg:98.43ms | |
| step:1184/1770 train_time:116540ms step_avg:98.43ms | |
| step:1185/1770 train_time:116642ms step_avg:98.43ms | |
| step:1186/1770 train_time:116744ms step_avg:98.43ms | |
| step:1187/1770 train_time:116846ms step_avg:98.44ms | |
| step:1188/1770 train_time:116950ms step_avg:98.44ms | |
| step:1189/1770 train_time:117054ms step_avg:98.45ms | |
| step:1190/1770 train_time:117158ms step_avg:98.45ms | |
| step:1191/1770 train_time:117261ms step_avg:98.46ms | |
| step:1192/1770 train_time:117365ms step_avg:98.46ms | |
| step:1193/1770 train_time:117469ms step_avg:98.46ms | |
| step:1194/1770 train_time:117572ms step_avg:98.47ms | |
| step:1195/1770 train_time:117675ms step_avg:98.47ms | |
| step:1196/1770 train_time:117778ms step_avg:98.48ms | |
| step:1197/1770 train_time:117881ms step_avg:98.48ms | |
| step:1198/1770 train_time:117983ms step_avg:98.48ms | |
| step:1199/1770 train_time:118087ms step_avg:98.49ms | |
| step:1200/1770 train_time:118189ms step_avg:98.49ms | |
| step:1201/1770 train_time:118293ms step_avg:98.50ms | |
| step:1202/1770 train_time:118397ms step_avg:98.50ms | |
| step:1203/1770 train_time:118501ms step_avg:98.50ms | |
| step:1204/1770 train_time:118603ms step_avg:98.51ms | |
| step:1205/1770 train_time:118705ms step_avg:98.51ms | |
| step:1206/1770 train_time:118808ms step_avg:98.51ms | |
| step:1207/1770 train_time:118910ms step_avg:98.52ms | |
| step:1208/1770 train_time:119017ms step_avg:98.52ms | |
| step:1209/1770 train_time:119121ms step_avg:98.53ms | |
| step:1210/1770 train_time:119223ms step_avg:98.53ms | |
| step:1211/1770 train_time:119326ms step_avg:98.54ms | |
| step:1212/1770 train_time:119429ms step_avg:98.54ms | |
| step:1213/1770 train_time:119533ms step_avg:98.54ms | |
| step:1214/1770 train_time:119638ms step_avg:98.55ms | |
| step:1215/1770 train_time:119740ms step_avg:98.55ms | |
| step:1216/1770 train_time:119843ms step_avg:98.56ms | |
| step:1217/1770 train_time:119947ms step_avg:98.56ms | |
| step:1218/1770 train_time:120053ms step_avg:98.57ms | |
| step:1219/1770 train_time:120156ms step_avg:98.57ms | |
| step:1220/1770 train_time:120258ms step_avg:98.57ms | |
| step:1221/1770 train_time:120361ms step_avg:98.58ms | |
| step:1222/1770 train_time:120463ms step_avg:98.58ms | |
| step:1223/1770 train_time:120565ms step_avg:98.58ms | |
| step:1224/1770 train_time:120669ms step_avg:98.59ms | |
| step:1225/1770 train_time:120772ms step_avg:98.59ms | |
| step:1226/1770 train_time:120877ms step_avg:98.59ms | |
| step:1227/1770 train_time:120981ms step_avg:98.60ms | |
| step:1228/1770 train_time:121083ms step_avg:98.60ms | |
| step:1229/1770 train_time:121186ms step_avg:98.61ms | |
| step:1230/1770 train_time:121290ms step_avg:98.61ms | |
| step:1231/1770 train_time:121393ms step_avg:98.61ms | |
| step:1232/1770 train_time:121497ms step_avg:98.62ms | |
| step:1233/1770 train_time:121599ms step_avg:98.62ms | |
| step:1234/1770 train_time:121701ms step_avg:98.62ms | |
| step:1235/1770 train_time:121803ms step_avg:98.63ms | |
| step:1236/1770 train_time:121906ms step_avg:98.63ms | |
| step:1237/1770 train_time:122010ms step_avg:98.63ms | |
| step:1238/1770 train_time:122115ms step_avg:98.64ms | |
| step:1239/1770 train_time:122218ms step_avg:98.64ms | |
| step:1240/1770 train_time:122319ms step_avg:98.64ms | |
| step:1241/1770 train_time:122421ms step_avg:98.65ms | |
| step:1242/1770 train_time:122523ms step_avg:98.65ms | |
| step:1243/1770 train_time:122627ms step_avg:98.65ms | |
| step:1244/1770 train_time:122731ms step_avg:98.66ms | |
| step:1245/1770 train_time:122834ms step_avg:98.66ms | |
| step:1246/1770 train_time:122939ms step_avg:98.67ms | |
| step:1247/1770 train_time:123041ms step_avg:98.67ms | |
| step:1248/1770 train_time:123145ms step_avg:98.67ms | |
| step:1249/1770 train_time:123247ms step_avg:98.68ms | |
| step:1250/1770 train_time:123350ms step_avg:98.68ms | |
| step:1250/1770 val_loss:3.4239 train_time:123648ms step_avg:98.92ms | |
| step:1251/1770 train_time:123658ms step_avg:98.85ms | |
| step:1252/1770 train_time:123667ms step_avg:98.78ms | |
| step:1253/1770 train_time:123675ms step_avg:98.70ms | |
| step:1254/1770 train_time:123766ms step_avg:98.70ms | |
| step:1255/1770 train_time:123868ms step_avg:98.70ms | |
| step:1256/1770 train_time:123970ms step_avg:98.70ms | |
| step:1257/1770 train_time:124074ms step_avg:98.71ms | |
| step:1258/1770 train_time:124177ms step_avg:98.71ms | |
| step:1259/1770 train_time:124280ms step_avg:98.71ms | |
| step:1260/1770 train_time:124383ms step_avg:98.72ms | |
| step:1261/1770 train_time:124485ms step_avg:98.72ms | |
| step:1262/1770 train_time:124592ms step_avg:98.73ms | |
| step:1263/1770 train_time:124699ms step_avg:98.73ms | |
| step:1264/1770 train_time:124802ms step_avg:98.74ms | |
| step:1265/1770 train_time:124904ms step_avg:98.74ms | |
| step:1266/1770 train_time:125007ms step_avg:98.74ms | |
| step:1267/1770 train_time:125109ms step_avg:98.74ms | |
| step:1268/1770 train_time:125212ms step_avg:98.75ms | |
| step:1269/1770 train_time:125315ms step_avg:98.75ms | |
| step:1270/1770 train_time:125419ms step_avg:98.76ms | |
| step:1271/1770 train_time:125522ms step_avg:98.76ms | |
| step:1272/1770 train_time:125625ms step_avg:98.76ms | |
| step:1273/1770 train_time:125729ms step_avg:98.77ms | |
| step:1274/1770 train_time:125833ms step_avg:98.77ms | |
| step:1275/1770 train_time:125938ms step_avg:98.78ms | |
| step:1276/1770 train_time:126040ms step_avg:98.78ms | |
| step:1277/1770 train_time:126142ms step_avg:98.78ms | |
| step:1278/1770 train_time:126244ms step_avg:98.78ms | |
| step:1279/1770 train_time:126346ms step_avg:98.79ms | |
| step:1280/1770 train_time:126450ms step_avg:98.79ms | |
| step:1281/1770 train_time:126555ms step_avg:98.79ms | |
| step:1282/1770 train_time:126659ms step_avg:98.80ms | |
| step:1283/1770 train_time:126762ms step_avg:98.80ms | |
| step:1284/1770 train_time:126866ms step_avg:98.81ms | |
| step:1285/1770 train_time:126969ms step_avg:98.81ms | |
| step:1286/1770 train_time:127072ms step_avg:98.81ms | |
| step:1287/1770 train_time:127175ms step_avg:98.82ms | |
| step:1288/1770 train_time:127279ms step_avg:98.82ms | |
| step:1289/1770 train_time:127382ms step_avg:98.82ms | |
| step:1290/1770 train_time:127483ms step_avg:98.82ms | |
| step:1291/1770 train_time:127586ms step_avg:98.83ms | |
| step:1292/1770 train_time:127690ms step_avg:98.83ms | |
| step:1293/1770 train_time:127794ms step_avg:98.84ms | |
| step:1294/1770 train_time:127899ms step_avg:98.84ms | |
| step:1295/1770 train_time:128002ms step_avg:98.84ms | |
| step:1296/1770 train_time:128104ms step_avg:98.85ms | |
| step:1297/1770 train_time:128207ms step_avg:98.85ms | |
| step:1298/1770 train_time:128310ms step_avg:98.85ms | |
| step:1299/1770 train_time:128412ms step_avg:98.85ms | |
| step:1300/1770 train_time:128516ms step_avg:98.86ms | |
| step:1301/1770 train_time:128620ms step_avg:98.86ms | |
| step:1302/1770 train_time:128723ms step_avg:98.87ms | |
| step:1303/1770 train_time:128826ms step_avg:98.87ms | |
| step:1304/1770 train_time:128930ms step_avg:98.87ms | |
| step:1305/1770 train_time:129033ms step_avg:98.88ms | |
| step:1306/1770 train_time:129136ms step_avg:98.88ms | |
| step:1307/1770 train_time:129240ms step_avg:98.88ms | |
| step:1308/1770 train_time:129341ms step_avg:98.88ms | |
| step:1309/1770 train_time:129443ms step_avg:98.89ms | |
| step:1310/1770 train_time:129545ms step_avg:98.89ms | |
| step:1311/1770 train_time:129648ms step_avg:98.89ms | |
| step:1312/1770 train_time:129751ms step_avg:98.90ms | |
| step:1313/1770 train_time:129856ms step_avg:98.90ms | |
| step:1314/1770 train_time:129961ms step_avg:98.90ms | |
| step:1315/1770 train_time:130063ms step_avg:98.91ms | |
| step:1316/1770 train_time:130166ms step_avg:98.91ms | |
| step:1317/1770 train_time:130268ms step_avg:98.91ms | |
| step:1318/1770 train_time:130371ms step_avg:98.92ms | |
| step:1319/1770 train_time:130475ms step_avg:98.92ms | |
| step:1320/1770 train_time:130580ms step_avg:98.92ms | |
| step:1321/1770 train_time:130681ms step_avg:98.93ms | |
| step:1322/1770 train_time:130785ms step_avg:98.93ms | |
| step:1323/1770 train_time:130889ms step_avg:98.93ms | |
| step:1324/1770 train_time:130994ms step_avg:98.94ms | |
| step:1325/1770 train_time:131098ms step_avg:98.94ms | |
| step:1326/1770 train_time:131200ms step_avg:98.94ms | |
| step:1327/1770 train_time:131303ms step_avg:98.95ms | |
| step:1328/1770 train_time:131405ms step_avg:98.95ms | |
| step:1329/1770 train_time:131512ms step_avg:98.96ms | |
| step:1330/1770 train_time:131614ms step_avg:98.96ms | |
| step:1331/1770 train_time:131718ms step_avg:98.96ms | |
| step:1332/1770 train_time:131822ms step_avg:98.97ms | |
| step:1333/1770 train_time:131924ms step_avg:98.97ms | |
| step:1334/1770 train_time:132028ms step_avg:98.97ms | |
| step:1335/1770 train_time:132131ms step_avg:98.97ms | |
| step:1336/1770 train_time:132234ms step_avg:98.98ms | |
| step:1337/1770 train_time:132338ms step_avg:98.98ms | |
| step:1338/1770 train_time:132441ms step_avg:98.98ms | |
| step:1339/1770 train_time:132543ms step_avg:98.99ms | |
| step:1340/1770 train_time:132646ms step_avg:98.99ms | |
| step:1341/1770 train_time:132749ms step_avg:98.99ms | |
| step:1342/1770 train_time:132852ms step_avg:99.00ms | |
| step:1343/1770 train_time:132956ms step_avg:99.00ms | |
| step:1344/1770 train_time:133060ms step_avg:99.00ms | |
| step:1345/1770 train_time:133164ms step_avg:99.01ms | |
| step:1346/1770 train_time:133267ms step_avg:99.01ms | |
| step:1347/1770 train_time:133369ms step_avg:99.01ms | |
| step:1348/1770 train_time:133472ms step_avg:99.01ms | |
| step:1349/1770 train_time:133575ms step_avg:99.02ms | |
| step:1350/1770 train_time:133679ms step_avg:99.02ms | |
| step:1351/1770 train_time:133780ms step_avg:99.02ms | |
| step:1352/1770 train_time:133884ms step_avg:99.03ms | |
| step:1353/1770 train_time:133987ms step_avg:99.03ms | |
| step:1354/1770 train_time:134090ms step_avg:99.03ms | |
| step:1355/1770 train_time:134194ms step_avg:99.04ms | |
| step:1356/1770 train_time:134298ms step_avg:99.04ms | |
| step:1357/1770 train_time:134401ms step_avg:99.04ms | |
| step:1358/1770 train_time:134503ms step_avg:99.04ms | |
| step:1359/1770 train_time:134606ms step_avg:99.05ms | |
| step:1360/1770 train_time:134709ms step_avg:99.05ms | |
| step:1361/1770 train_time:134813ms step_avg:99.05ms | |
| step:1362/1770 train_time:134918ms step_avg:99.06ms | |
| step:1363/1770 train_time:135022ms step_avg:99.06ms | |
| step:1364/1770 train_time:135125ms step_avg:99.06ms | |
| step:1365/1770 train_time:135228ms step_avg:99.07ms | |
| step:1366/1770 train_time:135331ms step_avg:99.07ms | |
| step:1367/1770 train_time:135434ms step_avg:99.07ms | |
| step:1368/1770 train_time:135538ms step_avg:99.08ms | |
| step:1369/1770 train_time:135641ms step_avg:99.08ms | |
| step:1370/1770 train_time:135743ms step_avg:99.08ms | |
| step:1371/1770 train_time:135847ms step_avg:99.09ms | |
| step:1372/1770 train_time:135951ms step_avg:99.09ms | |
| step:1373/1770 train_time:136055ms step_avg:99.09ms | |
| step:1374/1770 train_time:136161ms step_avg:99.10ms | |
| step:1375/1770 train_time:136264ms step_avg:99.10ms | |
| step:1375/1770 val_loss:3.3804 train_time:136563ms step_avg:99.32ms | |
| step:1376/1770 train_time:136573ms step_avg:99.25ms | |
| step:1377/1770 train_time:136581ms step_avg:99.19ms | |
| step:1378/1770 train_time:136590ms step_avg:99.12ms | |
| step:1379/1770 train_time:136684ms step_avg:99.12ms | |
| step:1380/1770 train_time:136785ms step_avg:99.12ms | |
| step:1381/1770 train_time:136887ms step_avg:99.12ms | |
| step:1382/1770 train_time:136989ms step_avg:99.12ms | |
| step:1383/1770 train_time:137091ms step_avg:99.13ms | |
| step:1384/1770 train_time:137195ms step_avg:99.13ms | |
| step:1385/1770 train_time:137297ms step_avg:99.13ms | |
| step:1386/1770 train_time:137400ms step_avg:99.13ms | |
| step:1387/1770 train_time:137507ms step_avg:99.14ms | |
| step:1388/1770 train_time:137614ms step_avg:99.15ms | |
| step:1389/1770 train_time:137718ms step_avg:99.15ms | |
| step:1390/1770 train_time:137821ms step_avg:99.15ms | |
| step:1391/1770 train_time:137923ms step_avg:99.15ms | |
| step:1392/1770 train_time:138024ms step_avg:99.16ms | |
| step:1393/1770 train_time:138127ms step_avg:99.16ms | |
| step:1394/1770 train_time:138229ms step_avg:99.16ms | |
| step:1395/1770 train_time:138333ms step_avg:99.16ms | |
| step:1396/1770 train_time:138438ms step_avg:99.17ms | |
| step:1397/1770 train_time:138542ms step_avg:99.17ms | |
| step:1398/1770 train_time:138648ms step_avg:99.18ms | |
| step:1399/1770 train_time:138753ms step_avg:99.18ms | |
| step:1400/1770 train_time:138857ms step_avg:99.18ms | |
| step:1401/1770 train_time:138960ms step_avg:99.19ms | |
| step:1402/1770 train_time:139061ms step_avg:99.19ms | |
| step:1403/1770 train_time:139163ms step_avg:99.19ms | |
| step:1404/1770 train_time:139265ms step_avg:99.19ms | |
| step:1405/1770 train_time:139369ms step_avg:99.19ms | |
| step:1406/1770 train_time:139475ms step_avg:99.20ms | |
| step:1407/1770 train_time:139578ms step_avg:99.20ms | |
| step:1408/1770 train_time:139683ms step_avg:99.21ms | |
| step:1409/1770 train_time:139786ms step_avg:99.21ms | |
| step:1410/1770 train_time:139890ms step_avg:99.21ms | |
| step:1411/1770 train_time:139995ms step_avg:99.22ms | |
| step:1412/1770 train_time:140097ms step_avg:99.22ms | |
| step:1413/1770 train_time:140200ms step_avg:99.22ms | |
| step:1414/1770 train_time:140302ms step_avg:99.22ms | |
| step:1415/1770 train_time:140406ms step_avg:99.23ms | |
| step:1416/1770 train_time:140511ms step_avg:99.23ms | |
| step:1417/1770 train_time:140616ms step_avg:99.24ms | |
| step:1418/1770 train_time:140720ms step_avg:99.24ms | |
| step:1419/1770 train_time:140822ms step_avg:99.24ms | |
| step:1420/1770 train_time:140925ms step_avg:99.24ms | |
| step:1421/1770 train_time:141029ms step_avg:99.25ms | |
| step:1422/1770 train_time:141133ms step_avg:99.25ms | |
| step:1423/1770 train_time:141235ms step_avg:99.25ms | |
| step:1424/1770 train_time:141337ms step_avg:99.25ms | |
| step:1425/1770 train_time:141440ms step_avg:99.26ms | |
| step:1426/1770 train_time:141543ms step_avg:99.26ms | |
| step:1427/1770 train_time:141646ms step_avg:99.26ms | |
| step:1428/1770 train_time:141751ms step_avg:99.27ms | |
| step:1429/1770 train_time:141855ms step_avg:99.27ms | |
| step:1430/1770 train_time:141958ms step_avg:99.27ms | |
| step:1431/1770 train_time:142060ms step_avg:99.27ms | |
| step:1432/1770 train_time:142163ms step_avg:99.28ms | |
| step:1433/1770 train_time:142266ms step_avg:99.28ms | |
| step:1434/1770 train_time:142370ms step_avg:99.28ms | |
| step:1435/1770 train_time:142474ms step_avg:99.29ms | |
| step:1436/1770 train_time:142578ms step_avg:99.29ms | |
| step:1437/1770 train_time:142681ms step_avg:99.29ms | |
| step:1438/1770 train_time:142788ms step_avg:99.30ms | |
| step:1439/1770 train_time:142891ms step_avg:99.30ms | |
| step:1440/1770 train_time:142994ms step_avg:99.30ms | |
| step:1441/1770 train_time:143096ms step_avg:99.30ms | |
| step:1442/1770 train_time:143199ms step_avg:99.31ms | |
| step:1443/1770 train_time:143303ms step_avg:99.31ms | |
| step:1444/1770 train_time:143405ms step_avg:99.31ms | |
| step:1445/1770 train_time:143509ms step_avg:99.31ms | |
| step:1446/1770 train_time:143616ms step_avg:99.32ms | |
| step:1447/1770 train_time:143720ms step_avg:99.32ms | |
| step:1448/1770 train_time:143824ms step_avg:99.33ms | |
| step:1449/1770 train_time:143928ms step_avg:99.33ms | |
| step:1450/1770 train_time:144033ms step_avg:99.33ms | |
| step:1451/1770 train_time:144140ms step_avg:99.34ms | |
| step:1452/1770 train_time:144243ms step_avg:99.34ms | |
| step:1453/1770 train_time:144347ms step_avg:99.34ms | |
| step:1454/1770 train_time:144451ms step_avg:99.35ms | |
| step:1455/1770 train_time:144558ms step_avg:99.35ms | |
| step:1456/1770 train_time:144662ms step_avg:99.36ms | |
| step:1457/1770 train_time:144766ms step_avg:99.36ms | |
| step:1458/1770 train_time:144871ms step_avg:99.36ms | |
| step:1459/1770 train_time:144978ms step_avg:99.37ms | |
| step:1460/1770 train_time:145082ms step_avg:99.37ms | |
| step:1461/1770 train_time:145185ms step_avg:99.37ms | |
| step:1462/1770 train_time:145289ms step_avg:99.38ms | |
| step:1463/1770 train_time:145394ms step_avg:99.38ms | |
| step:1464/1770 train_time:145500ms step_avg:99.39ms | |
| step:1465/1770 train_time:145605ms step_avg:99.39ms | |
| step:1466/1770 train_time:145709ms step_avg:99.39ms | |
| step:1467/1770 train_time:145814ms step_avg:99.40ms | |
| step:1468/1770 train_time:145920ms step_avg:99.40ms | |
| step:1469/1770 train_time:146025ms step_avg:99.40ms | |
| step:1470/1770 train_time:146129ms step_avg:99.41ms | |
| step:1471/1770 train_time:146232ms step_avg:99.41ms | |
| step:1472/1770 train_time:146336ms step_avg:99.41ms | |
| step:1473/1770 train_time:146439ms step_avg:99.42ms | |
| step:1474/1770 train_time:146544ms step_avg:99.42ms | |
| step:1475/1770 train_time:146651ms step_avg:99.42ms | |
| step:1476/1770 train_time:146755ms step_avg:99.43ms | |
| step:1477/1770 train_time:146859ms step_avg:99.43ms | |
| step:1478/1770 train_time:146964ms step_avg:99.43ms | |
| step:1479/1770 train_time:147068ms step_avg:99.44ms | |
| step:1480/1770 train_time:147172ms step_avg:99.44ms | |
| step:1481/1770 train_time:147279ms step_avg:99.45ms | |
| step:1482/1770 train_time:147384ms step_avg:99.45ms | |
| step:1483/1770 train_time:147491ms step_avg:99.45ms | |
| step:1484/1770 train_time:147595ms step_avg:99.46ms | |
| step:1485/1770 train_time:147699ms step_avg:99.46ms | |
| step:1486/1770 train_time:147804ms step_avg:99.46ms | |
| step:1487/1770 train_time:147908ms step_avg:99.47ms | |
| step:1488/1770 train_time:148013ms step_avg:99.47ms | |
| step:1489/1770 train_time:148116ms step_avg:99.47ms | |
| step:1490/1770 train_time:148222ms step_avg:99.48ms | |
| step:1491/1770 train_time:148328ms step_avg:99.48ms | |
| step:1492/1770 train_time:148431ms step_avg:99.48ms | |
| step:1493/1770 train_time:148535ms step_avg:99.49ms | |
| step:1494/1770 train_time:148639ms step_avg:99.49ms | |
| step:1495/1770 train_time:148746ms step_avg:99.50ms | |
| step:1496/1770 train_time:148854ms step_avg:99.50ms | |
| step:1497/1770 train_time:148959ms step_avg:99.51ms | |
| step:1498/1770 train_time:149062ms step_avg:99.51ms | |
| step:1499/1770 train_time:149165ms step_avg:99.51ms | |
| step:1500/1770 train_time:149269ms step_avg:99.51ms | |
| step:1500/1770 val_loss:3.3426 train_time:149570ms step_avg:99.71ms | |
| step:1501/1770 train_time:149581ms step_avg:99.65ms | |
| step:1502/1770 train_time:149590ms step_avg:99.59ms | |
| step:1503/1770 train_time:149598ms step_avg:99.53ms | |
| step:1504/1770 train_time:149690ms step_avg:99.53ms | |
| step:1505/1770 train_time:149796ms step_avg:99.53ms | |
| step:1506/1770 train_time:149899ms step_avg:99.53ms | |
| step:1507/1770 train_time:150004ms step_avg:99.54ms | |
| step:1508/1770 train_time:150107ms step_avg:99.54ms | |
| step:1509/1770 train_time:150211ms step_avg:99.54ms | |
| step:1510/1770 train_time:150314ms step_avg:99.55ms | |
| step:1511/1770 train_time:150418ms step_avg:99.55ms | |
| step:1512/1770 train_time:150524ms step_avg:99.55ms | |
| step:1513/1770 train_time:150632ms step_avg:99.56ms | |
| step:1514/1770 train_time:150737ms step_avg:99.56ms | |
| step:1515/1770 train_time:150840ms step_avg:99.56ms | |
| step:1516/1770 train_time:150943ms step_avg:99.57ms | |
| step:1517/1770 train_time:151047ms step_avg:99.57ms | |
| step:1518/1770 train_time:151151ms step_avg:99.57ms | |
| step:1519/1770 train_time:151254ms step_avg:99.57ms | |
| step:1520/1770 train_time:151359ms step_avg:99.58ms | |
| step:1521/1770 train_time:151464ms step_avg:99.58ms | |
| step:1522/1770 train_time:151569ms step_avg:99.59ms | |
| step:1523/1770 train_time:151676ms step_avg:99.59ms | |
| step:1524/1770 train_time:151780ms step_avg:99.59ms | |
| step:1525/1770 train_time:151885ms step_avg:99.60ms | |
| step:1526/1770 train_time:151989ms step_avg:99.60ms | |
| step:1527/1770 train_time:152093ms step_avg:99.60ms | |
| step:1528/1770 train_time:152196ms step_avg:99.60ms | |
| step:1529/1770 train_time:152301ms step_avg:99.61ms | |
| step:1530/1770 train_time:152407ms step_avg:99.61ms | |
| step:1531/1770 train_time:152513ms step_avg:99.62ms | |
| step:1532/1770 train_time:152619ms step_avg:99.62ms | |
| step:1533/1770 train_time:152722ms step_avg:99.62ms | |
| step:1534/1770 train_time:152827ms step_avg:99.63ms | |
| step:1535/1770 train_time:152933ms step_avg:99.63ms | |
| step:1536/1770 train_time:153036ms step_avg:99.63ms | |
| step:1537/1770 train_time:153140ms step_avg:99.64ms | |
| step:1538/1770 train_time:153244ms step_avg:99.64ms | |
| step:1539/1770 train_time:153349ms step_avg:99.64ms | |
| step:1540/1770 train_time:153456ms step_avg:99.65ms | |
| step:1541/1770 train_time:153560ms step_avg:99.65ms | |
| step:1542/1770 train_time:153666ms step_avg:99.65ms | |
| step:1543/1770 train_time:153771ms step_avg:99.66ms | |
| step:1544/1770 train_time:153877ms step_avg:99.66ms | |
| step:1545/1770 train_time:153981ms step_avg:99.66ms | |
| step:1546/1770 train_time:154085ms step_avg:99.67ms | |
| step:1547/1770 train_time:154191ms step_avg:99.67ms | |
| step:1548/1770 train_time:154296ms step_avg:99.67ms | |
| step:1549/1770 train_time:154398ms step_avg:99.68ms | |
| step:1550/1770 train_time:154502ms step_avg:99.68ms | |
| step:1551/1770 train_time:154607ms step_avg:99.68ms | |
| step:1552/1770 train_time:154712ms step_avg:99.69ms | |
| step:1553/1770 train_time:154815ms step_avg:99.69ms | |
| step:1554/1770 train_time:154920ms step_avg:99.69ms | |
| step:1555/1770 train_time:155024ms step_avg:99.69ms | |
| step:1556/1770 train_time:155127ms step_avg:99.70ms | |
| step:1557/1770 train_time:155232ms step_avg:99.70ms | |
| step:1558/1770 train_time:155336ms step_avg:99.70ms | |
| step:1559/1770 train_time:155440ms step_avg:99.70ms | |
| step:1560/1770 train_time:155544ms step_avg:99.71ms | |
| step:1561/1770 train_time:155649ms step_avg:99.71ms | |
| step:1562/1770 train_time:155754ms step_avg:99.71ms | |
| step:1563/1770 train_time:155860ms step_avg:99.72ms | |
| step:1564/1770 train_time:155963ms step_avg:99.72ms | |
| step:1565/1770 train_time:156067ms step_avg:99.72ms | |
| step:1566/1770 train_time:156172ms step_avg:99.73ms | |
| step:1567/1770 train_time:156276ms step_avg:99.73ms | |
| step:1568/1770 train_time:156380ms step_avg:99.73ms | |
| step:1569/1770 train_time:156485ms step_avg:99.74ms | |
| step:1570/1770 train_time:156589ms step_avg:99.74ms | |
| step:1571/1770 train_time:156696ms step_avg:99.74ms | |
| step:1572/1770 train_time:156801ms step_avg:99.75ms | |
| step:1573/1770 train_time:156906ms step_avg:99.75ms | |
| step:1574/1770 train_time:157010ms step_avg:99.75ms | |
| step:1575/1770 train_time:157116ms step_avg:99.76ms | |
| step:1576/1770 train_time:157220ms step_avg:99.76ms | |
| step:1577/1770 train_time:157325ms step_avg:99.76ms | |
| step:1578/1770 train_time:157429ms step_avg:99.76ms | |
| step:1579/1770 train_time:157536ms step_avg:99.77ms | |
| step:1580/1770 train_time:157642ms step_avg:99.77ms | |
| step:1581/1770 train_time:157745ms step_avg:99.78ms | |
| step:1582/1770 train_time:157851ms step_avg:99.78ms | |
| step:1583/1770 train_time:157956ms step_avg:99.78ms | |
| step:1584/1770 train_time:158060ms step_avg:99.79ms | |
| step:1585/1770 train_time:158164ms step_avg:99.79ms | |
| step:1586/1770 train_time:158270ms step_avg:99.79ms | |
| step:1587/1770 train_time:158373ms step_avg:99.79ms | |
| step:1588/1770 train_time:158479ms step_avg:99.80ms | |
| step:1589/1770 train_time:158583ms step_avg:99.80ms | |
| step:1590/1770 train_time:158687ms step_avg:99.80ms | |
| step:1591/1770 train_time:158793ms step_avg:99.81ms | |
| step:1592/1770 train_time:158897ms step_avg:99.81ms | |
| step:1593/1770 train_time:158999ms step_avg:99.81ms | |
| step:1594/1770 train_time:159104ms step_avg:99.81ms | |
| step:1595/1770 train_time:159209ms step_avg:99.82ms | |
| step:1596/1770 train_time:159314ms step_avg:99.82ms | |
| step:1597/1770 train_time:159418ms step_avg:99.82ms | |
| step:1598/1770 train_time:159522ms step_avg:99.83ms | |
| step:1599/1770 train_time:159625ms step_avg:99.83ms | |
| step:1600/1770 train_time:159731ms step_avg:99.83ms | |
| step:1601/1770 train_time:159836ms step_avg:99.84ms | |
| step:1602/1770 train_time:159942ms step_avg:99.84ms | |
| step:1603/1770 train_time:160046ms step_avg:99.84ms | |
| step:1604/1770 train_time:160151ms step_avg:99.84ms | |
| step:1605/1770 train_time:160255ms step_avg:99.85ms | |
| step:1606/1770 train_time:160359ms step_avg:99.85ms | |
| step:1607/1770 train_time:160464ms step_avg:99.85ms | |
| step:1608/1770 train_time:160569ms step_avg:99.86ms | |
| step:1609/1770 train_time:160679ms step_avg:99.86ms | |
| step:1610/1770 train_time:160784ms step_avg:99.87ms | |
| step:1611/1770 train_time:160890ms step_avg:99.87ms | |
| step:1612/1770 train_time:160995ms step_avg:99.87ms | |
| step:1613/1770 train_time:161100ms step_avg:99.88ms | |
| step:1614/1770 train_time:161204ms step_avg:99.88ms | |
| step:1615/1770 train_time:161308ms step_avg:99.88ms | |
| step:1616/1770 train_time:161413ms step_avg:99.88ms | |
| step:1617/1770 train_time:161519ms step_avg:99.89ms | |
| step:1618/1770 train_time:161624ms step_avg:99.89ms | |
| step:1619/1770 train_time:161729ms step_avg:99.89ms | |
| step:1620/1770 train_time:161836ms step_avg:99.90ms | |
| step:1621/1770 train_time:161940ms step_avg:99.90ms | |
| step:1622/1770 train_time:162046ms step_avg:99.90ms | |
| step:1623/1770 train_time:162150ms step_avg:99.91ms | |
| step:1624/1770 train_time:162254ms step_avg:99.91ms | |
| step:1625/1770 train_time:162361ms step_avg:99.91ms | |
| step:1625/1770 val_loss:3.3080 train_time:162662ms step_avg:100.10ms | |
| step:1626/1770 train_time:162673ms step_avg:100.04ms | |
| step:1627/1770 train_time:162682ms step_avg:99.99ms | |
| step:1628/1770 train_time:162690ms step_avg:99.93ms | |
| step:1629/1770 train_time:162782ms step_avg:99.93ms | |
| step:1630/1770 train_time:162885ms step_avg:99.93ms | |
| step:1631/1770 train_time:162988ms step_avg:99.93ms | |
| step:1632/1770 train_time:163090ms step_avg:99.93ms | |
| step:1633/1770 train_time:163193ms step_avg:99.93ms | |
| step:1634/1770 train_time:163295ms step_avg:99.94ms | |
| step:1635/1770 train_time:163399ms step_avg:99.94ms | |
| step:1636/1770 train_time:163502ms step_avg:99.94ms | |
| step:1637/1770 train_time:163611ms step_avg:99.95ms | |
| step:1638/1770 train_time:163718ms step_avg:99.95ms | |
| step:1639/1770 train_time:163824ms step_avg:99.95ms | |
| step:1640/1770 train_time:163927ms step_avg:99.96ms | |
| step:1641/1770 train_time:164031ms step_avg:99.96ms | |
| step:1642/1770 train_time:164135ms step_avg:99.96ms | |
| step:1643/1770 train_time:164238ms step_avg:99.96ms | |
| step:1644/1770 train_time:164341ms step_avg:99.96ms | |
| step:1645/1770 train_time:164446ms step_avg:99.97ms | |
| step:1646/1770 train_time:164552ms step_avg:99.97ms | |
| step:1647/1770 train_time:164658ms step_avg:99.97ms | |
| step:1648/1770 train_time:164767ms step_avg:99.98ms | |
| step:1649/1770 train_time:164873ms step_avg:99.98ms | |
| step:1650/1770 train_time:164976ms step_avg:99.99ms | |
| step:1651/1770 train_time:165080ms step_avg:99.99ms | |
| step:1652/1770 train_time:165184ms step_avg:99.99ms | |
| step:1653/1770 train_time:165286ms step_avg:99.99ms | |
| step:1654/1770 train_time:165391ms step_avg:99.99ms | |
| step:1655/1770 train_time:165495ms step_avg:100.00ms | |
| step:1656/1770 train_time:165602ms step_avg:100.00ms | |
| step:1657/1770 train_time:165708ms step_avg:100.00ms | |
| step:1658/1770 train_time:165813ms step_avg:100.01ms | |
| step:1659/1770 train_time:165919ms step_avg:100.01ms | |
| step:1660/1770 train_time:166024ms step_avg:100.01ms | |
| step:1661/1770 train_time:166129ms step_avg:100.02ms | |
| step:1662/1770 train_time:166231ms step_avg:100.02ms | |
| step:1663/1770 train_time:166335ms step_avg:100.02ms | |
| step:1664/1770 train_time:166439ms step_avg:100.02ms | |
| step:1665/1770 train_time:166544ms step_avg:100.03ms | |
| step:1666/1770 train_time:166650ms step_avg:100.03ms | |
| step:1667/1770 train_time:166755ms step_avg:100.03ms | |
| step:1668/1770 train_time:166860ms step_avg:100.04ms | |
| step:1669/1770 train_time:166965ms step_avg:100.04ms | |
| step:1670/1770 train_time:167069ms step_avg:100.04ms | |
| step:1671/1770 train_time:167172ms step_avg:100.04ms | |
| step:1672/1770 train_time:167276ms step_avg:100.05ms | |
| step:1673/1770 train_time:167381ms step_avg:100.05ms | |
| step:1674/1770 train_time:167486ms step_avg:100.05ms | |
| step:1675/1770 train_time:167592ms step_avg:100.05ms | |
| step:1676/1770 train_time:167695ms step_avg:100.06ms | |
| step:1677/1770 train_time:167801ms step_avg:100.06ms | |
| step:1678/1770 train_time:167909ms step_avg:100.06ms | |
| step:1679/1770 train_time:168016ms step_avg:100.07ms | |
| step:1680/1770 train_time:168119ms step_avg:100.07ms | |
| step:1681/1770 train_time:168223ms step_avg:100.07ms | |
| step:1682/1770 train_time:168326ms step_avg:100.08ms | |
| step:1683/1770 train_time:168430ms step_avg:100.08ms | |
| step:1684/1770 train_time:168537ms step_avg:100.08ms | |
| step:1685/1770 train_time:168642ms step_avg:100.08ms | |
| step:1686/1770 train_time:168748ms step_avg:100.09ms | |
| step:1687/1770 train_time:168853ms step_avg:100.09ms | |
| step:1688/1770 train_time:168960ms step_avg:100.09ms | |
| step:1689/1770 train_time:169068ms step_avg:100.10ms | |
| step:1690/1770 train_time:169172ms step_avg:100.10ms | |
| step:1691/1770 train_time:169275ms step_avg:100.10ms | |
| step:1692/1770 train_time:169379ms step_avg:100.11ms | |
| step:1693/1770 train_time:169484ms step_avg:100.11ms | |
| step:1694/1770 train_time:169590ms step_avg:100.11ms | |
| step:1695/1770 train_time:169694ms step_avg:100.11ms | |
| step:1696/1770 train_time:169799ms step_avg:100.12ms | |
| step:1697/1770 train_time:169905ms step_avg:100.12ms | |
| step:1698/1770 train_time:170011ms step_avg:100.12ms | |
| step:1699/1770 train_time:170116ms step_avg:100.13ms | |
| step:1700/1770 train_time:170220ms step_avg:100.13ms | |
| step:1701/1770 train_time:170324ms step_avg:100.13ms | |
| step:1702/1770 train_time:170427ms step_avg:100.13ms | |
| step:1703/1770 train_time:170531ms step_avg:100.14ms | |
| step:1704/1770 train_time:170634ms step_avg:100.14ms | |
| step:1705/1770 train_time:170739ms step_avg:100.14ms | |
| step:1706/1770 train_time:170848ms step_avg:100.15ms | |
| step:1707/1770 train_time:170953ms step_avg:100.15ms | |
| step:1708/1770 train_time:171058ms step_avg:100.15ms | |
| step:1709/1770 train_time:171165ms step_avg:100.15ms | |
| step:1710/1770 train_time:171269ms step_avg:100.16ms | |
| step:1711/1770 train_time:171374ms step_avg:100.16ms | |
| step:1712/1770 train_time:171485ms step_avg:100.17ms | |
| step:1713/1770 train_time:171594ms step_avg:100.17ms | |
| step:1714/1770 train_time:171698ms step_avg:100.17ms | |
| step:1715/1770 train_time:171805ms step_avg:100.18ms | |
| step:1716/1770 train_time:171909ms step_avg:100.18ms | |
| step:1717/1770 train_time:172015ms step_avg:100.18ms | |
| step:1718/1770 train_time:172121ms step_avg:100.19ms | |
| step:1719/1770 train_time:172226ms step_avg:100.19ms | |
| step:1720/1770 train_time:172330ms step_avg:100.19ms | |
| step:1721/1770 train_time:172437ms step_avg:100.20ms | |
| step:1722/1770 train_time:172545ms step_avg:100.20ms | |
| step:1723/1770 train_time:172651ms step_avg:100.20ms | |
| step:1724/1770 train_time:172758ms step_avg:100.21ms | |
| step:1725/1770 train_time:172864ms step_avg:100.21ms | |
| step:1726/1770 train_time:172971ms step_avg:100.21ms | |
| step:1727/1770 train_time:173078ms step_avg:100.22ms | |
| step:1728/1770 train_time:173185ms step_avg:100.22ms | |
| step:1729/1770 train_time:173291ms step_avg:100.23ms | |
| step:1730/1770 train_time:173397ms step_avg:100.23ms | |
| step:1731/1770 train_time:173503ms step_avg:100.23ms | |
| step:1732/1770 train_time:173608ms step_avg:100.24ms | |
| step:1733/1770 train_time:173713ms step_avg:100.24ms | |
| step:1734/1770 train_time:173819ms step_avg:100.24ms | |
| step:1735/1770 train_time:173926ms step_avg:100.25ms | |
| step:1736/1770 train_time:174030ms step_avg:100.25ms | |
| step:1737/1770 train_time:174135ms step_avg:100.25ms | |
| step:1738/1770 train_time:174240ms step_avg:100.25ms | |
| step:1739/1770 train_time:174347ms step_avg:100.26ms | |
| step:1740/1770 train_time:174452ms step_avg:100.26ms | |
| step:1741/1770 train_time:174556ms step_avg:100.26ms | |
| step:1742/1770 train_time:174662ms step_avg:100.26ms | |
| step:1743/1770 train_time:174769ms step_avg:100.27ms | |
| step:1744/1770 train_time:174876ms step_avg:100.27ms | |
| step:1745/1770 train_time:174982ms step_avg:100.28ms | |
| step:1746/1770 train_time:175088ms step_avg:100.28ms | |
| step:1747/1770 train_time:175193ms step_avg:100.28ms | |
| step:1748/1770 train_time:175301ms step_avg:100.29ms | |
| step:1749/1770 train_time:175406ms step_avg:100.29ms | |
| step:1750/1770 train_time:175514ms step_avg:100.29ms | |
| step:1750/1770 val_loss:3.2813 train_time:175816ms step_avg:100.47ms | |
| step:1751/1770 train_time:175826ms step_avg:100.41ms | |
| step:1752/1770 train_time:175836ms step_avg:100.36ms | |
| step:1753/1770 train_time:175844ms step_avg:100.31ms | |
| step:1754/1770 train_time:175935ms step_avg:100.31ms | |
| step:1755/1770 train_time:176040ms step_avg:100.31ms | |
| step:1756/1770 train_time:176144ms step_avg:100.31ms | |
| step:1757/1770 train_time:176248ms step_avg:100.31ms | |
| step:1758/1770 train_time:176352ms step_avg:100.31ms | |
| step:1759/1770 train_time:176457ms step_avg:100.32ms | |
| step:1760/1770 train_time:176561ms step_avg:100.32ms | |
| step:1761/1770 train_time:176667ms step_avg:100.32ms | |
| step:1762/1770 train_time:176778ms step_avg:100.33ms | |
| step:1763/1770 train_time:176885ms step_avg:100.33ms | |
| step:1764/1770 train_time:176995ms step_avg:100.34ms | |
| step:1765/1770 train_time:177099ms step_avg:100.34ms | |
| step:1766/1770 train_time:177202ms step_avg:100.34ms | |
| step:1767/1770 train_time:177307ms step_avg:100.34ms | |
| step:1768/1770 train_time:177415ms step_avg:100.35ms | |
| step:1769/1770 train_time:177517ms step_avg:100.35ms | |
| step:1770/1770 train_time:177622ms step_avg:100.35ms | |
| step:1770/1770 val_loss:3.2782 train_time:177930ms step_avg:100.53ms | |
| peak memory allocated: 29784 MiB reserved: 40536 MiB | |