Castillo_Henry_903002104 / records /053025_noallreduce /8054c239-3a18-499e-b0c8-dbd27cb4b3ab.txt
henrycastillo's picture
add everything but lm eval harness
c3b20da verified
import os
import sys
with open(sys.argv[0]) as f:
code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import copy
import glob
from dataclasses import dataclass
from functools import lru_cache, partial # Added partial for hook registration
from pathlib import Path
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
torch.empty(1, device="cuda", requires_grad=True).backward() # prevents a bug on some systems
from torch import Tensor, nn, autocast
import torch.nn.functional as F
import torch.distributed as dist
# use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention
#torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min
#import wandb
# -----------------------------------------------------------------------------
# Custom operators: FP8 matmul by @YouJiacheng
@torch.library.custom_op("nanogpt::mm", mutates_args=())
def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
    """Forward FP8 matmul: computes ``x @ w.T`` with both operands cast to float8_e4m3fn.

    Args:
        x: contiguous input matrix (contiguity is asserted below).
        w: contiguous weight matrix (contiguity is asserted below).
        x_s: divisor applied to ``x`` before the FP8 cast; also handed to
            ``torch._scaled_mm`` as ``scale_a``.
        w_s: divisor applied to ``w`` before the FP8 cast; also handed to
            ``torch._scaled_mm`` as ``scale_b``.
        grad_s: not used in this function; it is part of the op signature so that
            ``setup_context`` can pick it up from the op inputs for the backward pass.

    Returns:
        ``(out, x_f8, w_f8)``: the bfloat16 product and the two FP8-quantized operands.
    """
    @torch.compile
    def impl(x: Tensor, w: Tensor):
        assert x.is_contiguous() and w.is_contiguous()
        # quantize: divide by the scale, then cast to FP8 (e4m3)
        x_f8 = x.div(x_s).to(torch.float8_e4m3fn)
        w_f8 = w.div(w_s).to(torch.float8_e4m3fn)
        # the same scales are passed back in so the product is rescaled inside the kernel
        out = torch._scaled_mm(
            x_f8,
            w_f8.T,
            out_dtype=torch.bfloat16,
            scale_a=x.new_tensor(x_s, dtype=torch.float32),
            scale_b=x.new_tensor(w_s, dtype=torch.float32),
            use_fast_accum=True,
        )
        return out, x_f8, w_f8
    return impl(x, w)
@mm_op.register_fake
def _(x: Tensor, w: Tensor, *_):
    """Fake implementation of ``nanogpt::mm``: validates the inputs and returns
    tensors with the shapes/dtypes of the real outputs (product plus FP8 operands)."""
    # shape / placement / layout checks on the two matrices
    assert x.ndim == w.ndim == 2
    assert x.shape[1] == w.shape[1]
    assert x.device == w.device
    assert x.is_contiguous() and w.is_contiguous()
    fp8_dtype = torch.float8_e4m3fn
    product = x @ w.T
    return product, x.to(fp8_dtype), w.to(fp8_dtype)
@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]:
    """Backward FP8 matmul: produces the gradients w.r.t. the input and the weight.

    Args:
        g: contiguous upstream gradient (contiguity is asserted below).
        x_f8: FP8-quantized input saved by the forward op.
        w_f8: FP8-quantized weight saved by the forward op.
        x_s: scale that was used to quantize ``x``.
        w_s: scale that was used to quantize ``w``.
        grad_s: divisor applied to ``g`` before casting it to float8_e5m2.

    Returns:
        ``(grad_x, grad_w)``: ``grad_x`` in bfloat16 and ``grad_w`` in float32.
    """
    @torch.compile
    def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor):
        assert grad.is_contiguous()
        # scalar scale tensors on the same device as grad
        x_inv_s = grad.new_tensor(x_s, dtype=torch.float32)
        w_inv_s = grad.new_tensor(w_s, dtype=torch.float32)
        grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32)
        # gradients are quantized to e5m2 (operands use e4m3 in the forward op)
        grad_f8 = grad.div(grad_s).to(torch.float8_e5m2)
        # `.T.contiguous().T` keeps the shape but re-lays the data out column-major;
        # presumably required by torch._scaled_mm for its second operand -- TODO confirm
        grad_x = torch._scaled_mm(
            grad_f8,
            w_f8.T.contiguous().T,
            out_dtype=torch.bfloat16,
            scale_a=grad_inv_s,
            scale_b=w_inv_s,
            use_fast_accum=False,
        )
        # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768)
        # computes x^T @ grad and transposes the result to get the weight gradient
        grad_w = torch._scaled_mm(
            x_f8.T.contiguous(),
            grad_f8.T.contiguous().T,
            out_dtype=torch.float32,
            scale_a=x_inv_s,
            scale_b=grad_inv_s,
            use_fast_accum=False,
        ).T
        return grad_x, grad_w
    return impl(g, x_f8, w_f8)
@mm_backward_op.register_fake
def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_):
    """Fake implementation of ``nanogpt::mm_backward``: returns stand-ins for
    ``(grad_x, grad_w)`` derived from the saved FP8 operands."""
    grad_x_like = x_f8.to(torch.bfloat16)
    # transpose -> contiguous -> transpose: same shape as w_f8, column-major layout
    grad_w_like = w_f8.T.contiguous().T.to(torch.float32)
    return grad_x_like, grad_w_like
def backward(ctx, grad_out: Tensor, *_):
    """Autograd backward for ``nanogpt::mm``.

    Reads the FP8 operands and the three scales stored on ``ctx`` and delegates to
    ``torch.ops.nanogpt.mm_backward``. Returns one gradient per forward input; the
    three float scales get ``None``.
    """
    scale_x, scale_w, scale_grad = ctx.scales
    saved_x, saved_w = ctx.saved_tensors
    d_input, d_weight = torch.ops.nanogpt.mm_backward(
        grad_out, saved_x, saved_w, scale_x, scale_w, scale_grad
    )
    return d_input, d_weight, None, None, None
def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
    """Stash on ``ctx`` what ``backward`` needs.

    The last three op inputs are the scales; the last two op outputs are the
    FP8-quantized operands (the first output, the product, is not saved).
    """
    *_, scale_x, scale_w, scale_grad = inputs
    _, quant_x, quant_w = output
    ctx.save_for_backward(quant_x, quant_w)
    ctx.scales = (scale_x, scale_w, scale_grad)
    # do not materialize undefined output grads as zero tensors
    ctx.set_materialize_grads(False)
# Wire the custom forward op to its backward formula and context-setup hook.
mm_op.register_autograd(backward, setup_context=setup_context)
# -----------------------------------------------------------------------------
# Muon optimizer
@torch.compile(mode="reduce-overhead", fullgraph=True, dynamic=False)
def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor:
    """
    Approximately orthogonalize G (its "zeroth power") with a quintic Newton-Schulz iteration.

    The coefficients are chosen to maximize the slope at zero; pushing the slope beyond the
    point where the iteration converges to one everywhere was found empirically to reduce the
    number of steps needed. As a result the output is not UV^T (with USV^T = G the SVD) but
    roughly US'V^T with S' diagonal and S'_{ii} ~ Uniform(0.5, 1.5), which does not hurt model
    performance relative to UV^T.

    G may carry leading batch dimensions; the iteration runs in bfloat16 and the result is
    cast back to G's dtype.
    """
    assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
    coeff_a, coeff_b, coeff_c = 3.4445, -4.7750, 2.0315
    # work on the wide orientation so that X @ X.mT is the smaller Gram matrix
    is_tall = G.size(-2) > G.size(-1)
    X = G.bfloat16()
    if is_tall:
        X = X.mT
    # Frobenius normalization guarantees the spectral norm is at most 1
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
    for _ in range(steps):
        gram = X @ X.mT
        # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        poly = coeff_b * gram + coeff_c * gram @ gram
        X = coeff_a * X + poly @ X
    if is_tall:
        X = X.mT
    return X.type_as(G)
class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz
    https://kellerjordan.github.io/posts/muon/

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Warning: This optimizer should not be used for the embedding layer, the final fully connected layer,
    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).

    Parameters are grouped by shape. Within a group, parameter ``i`` is updated by rank
    ``i % world_size``: gradients are averaged with ``reduce_scatter`` and the updated weights
    are redistributed with ``all_gather``.

    Args:
        params: iterable of parameters (each must have at least 2 dims; ``size(-2)`` is read).
        lr: base learning rate; scaled per parameter by ``max(1, rows/cols) ** 0.5`` and by the
            optional ``lr_mul`` attribute of the parameter.
        weight_decay: decoupled weight decay, multiplied by ``lr`` and the optional ``wd_mul``.
        momentum: momentum coefficient.
        rank: index of this process.
        world_size: number of processes.
    """
    def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95, rank=0, world_size=1):
        self.rank = rank
        self.world_size = world_size
        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum)
        params = list(params)
        sizes = {p.shape for p in params}
        # one param group per unique parameter shape
        param_groups = [dict(params=[p for p in params if p.shape == size]) for size in sizes]
        super().__init__(param_groups, defaults)

    @torch.no_grad()
    def step(self):
        """Average gradients across ranks, update the parameters owned by this rank, and
        gather the updated parameters back to every rank. Requires every parameter to
        have a gradient and an initialized process group."""
        futures: list[torch.Future] = []
        reduce_scatter_futures: list[torch.Future] = []
        # Phase 1: launch all gradient reduce-scatters asynchronously.
        for group in self.param_groups:
            params: list[Tensor] = group["params"]
            # Dedicated throw-away output for chunks in which this rank owns no parameter.
            # It must NOT alias a real gradient: the padded reduction result (zeros) would
            # otherwise be written over an already-reduced gradient that has not been
            # consumed yet.
            scratch = torch.empty_like(params[-1])
            grad_pad = [param.grad for param in params] + [torch.zeros_like(params[-1])] * self.world_size
            for base_i in range(0, len(params), self.world_size):
                owned_i = base_i + self.rank
                out = params[owned_i].grad if owned_i < len(params) else scratch
                # This gives strange dynamo warnings
                reduce_scatter_futures.append(dist.reduce_scatter(out, grad_pad[base_i:base_i + self.world_size], op=dist.ReduceOp.AVG, async_op=True).get_future())
        # Phase 2: as each reduction completes, update the owned parameter and gather.
        idx = 0
        for group in self.param_groups:
            params: list[Tensor] = group["params"]
            params_pad = params + [torch.empty_like(params[-1])] * self.world_size
            momentum = group["momentum"]
            for base_i in range(0, len(params), self.world_size):
                reduce_scatter_futures[idx].wait()
                if base_i + self.rank < len(params):
                    p = params[base_i + self.rank]
                    grad = p.grad
                    eff_lr = group["lr"] * max(1, p.size(-2) / p.size(-1)) ** 0.5 * getattr(p, "lr_mul", 1.0)
                    eff_weight_decay = group["lr"] * group["weight_decay"] * getattr(p, "wd_mul", 1.0)
                    state = self.state[p]
                    if len(state) == 0:
                        state["momentum_buffer"] = torch.zeros_like(grad)
                    momentum_buffer = state["momentum_buffer"]
                    p.mul_(1 - eff_weight_decay)
                    momentum_buffer.lerp_(grad, 1 - momentum)
                    # Nesterov-style blend of the gradient with the momentum buffer (in place)
                    grad = grad.lerp_(momentum_buffer, momentum)
                    v = zeropower_via_newtonschulz5(grad, 5)
                    p.add_(other=v, alpha=-eff_lr)
                idx += 1
                futures.append(dist.all_gather(params_pad[base_i:base_i + self.world_size], params_pad[base_i + self.rank], async_op=True).get_future())
        # TODO: Check if commenting it is dangerous
        torch.futures.collect_all(futures).wait()
class DistAdam(torch.optim.Optimizer):
    """Adam with decoupled weight decay whose update is sharded across ranks.

    For every parameter the gradient is averaged with ``reduce_scatter_tensor`` so that each
    rank receives one contiguous slice along dim 0; the rank updates its slice of the
    parameter (keeping Adam state only for that slice) and the full parameter is rebuilt
    with ``all_gather_into_tensor``. The slice length is ``p.shape[0] // world_size``.

    Args:
        params: iterable of parameters; they are grouped by shape.
        lr: base learning rate (scaled by the optional ``lr_mul`` attribute of a parameter).
        betas: coefficients of the running averages of the gradient and its square.
        eps: term added to the denominator.
        weight_decay: decoupled weight decay (scaled by ``lr`` and the optional ``wd_mul``).
        rank: index of this process.
        world_size: number of processes.
    """
    def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01, rank: int = 0, world_size: int = 1):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        params = list(params)
        sizes = {p.shape for p in params}
        self.rank = rank
        self.world_size = world_size
        # one param group per unique parameter shape
        param_groups = [dict(params=[p for p in params if p.shape == size]) for size in sizes]
        super().__init__(param_groups, defaults)

    @torch.no_grad()
    def step(self):
        """Run one sharded Adam update. Requires every parameter to have a gradient and an
        initialized process group."""
        # Use the values given to the constructor rather than same-named module globals.
        rank, world_size = self.rank, self.world_size
        futures: list[torch.Future] = []
        reduce_scatter_futures: list[torch.Future] = []
        grad_slices = []
        # Phase 1: launch all gradient reduce-scatters asynchronously.
        for group in self.param_groups:
            for p in group["params"]:
                grad = p.grad
                rank_size = grad.shape[0] // world_size
                grad_slice = torch.empty_like(grad[:rank_size])
                reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future())
                grad_slices.append(grad_slice)
        # Phase 2: update this rank's slice of each parameter, then gather the full tensor.
        idx = 0
        for group in self.param_groups:
            beta1, beta2 = group['betas']
            eps = group['eps']
            wd = group['weight_decay']
            for p in group['params']:
                reduce_scatter_futures[idx].wait()
                rank_size = p.shape[0] // world_size
                p_slice = p[rank * rank_size:(rank + 1) * rank_size]
                lr = group['lr'] * getattr(p, "lr_mul", 1.0)
                state = self.state[p]
                g_slice = grad_slices[idx]
                # State init
                if not state:
                    state['step'] = torch.tensor(0, dtype=torch.int64, device=p.device)
                    state['exp_avg'] = torch.zeros_like(p_slice)
                    state['exp_avg_sq'] = torch.zeros_like(p_slice)
                exp_avg = state['exp_avg']
                exp_avg_sq = state['exp_avg_sq']
                state['step'] += 1
                t = state['step']
                # weight decay
                if wd != 0:
                    eff_weight_decay = lr * wd * getattr(p, "wd_mul", 1.0)
                    p_slice.mul_(1 - eff_weight_decay)
                # update running averages
                exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2)
                # bias corrections (tensors, because t is a tensor)
                bias1 = 1 - beta1 ** t
                bias2 = 1 - beta2 ** t
                # compute step
                denom = exp_avg_sq.sqrt().add_(eps)
                step_size = lr * (torch.sqrt(bias2) / bias1)
                update = exp_avg.div(denom).mul_(step_size)
                p_slice.add_(other=update, alpha=-1.0)
                idx += 1
                futures.append(dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future())
        # TODO: Check if commenting it is dangerous
        torch.futures.collect_all(futures).wait()
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the model
def norm(x: Tensor):
return F.rms_norm(x, (x.size(-1),))
class CastedLinear(nn.Linear):
    """Bias-free linear layer with a custom uniform init and an optional FP8 training path.

    When ``use_fp8`` is set and the module is in training mode, the product is computed by
    the ``nanogpt::mm`` custom op using the stored scales; otherwise ``F.linear`` is used.
    """
    def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0):
        super().__init__(in_features, out_features, bias=False)
        self.use_fp8 = use_fp8
        # quantization scales for the input, the weight and the gradient
        self.x_s, self.w_s, self.grad_s = x_s, w_s, grad_s

    def reset_parameters(self) -> None:
        # 0.5 is a bit better than the default 1/sqrt(3)
        std = 0.5 * (self.in_features ** -0.5)
        bound = (3 ** 0.5) * std
        with torch.no_grad():
            self.weight.uniform_(-bound, bound)

    def forward(self, x: Tensor):
        if not (self.use_fp8 and self.training):
            return F.linear(x, self.weight)
        # the custom op works on 2-D inputs: collapse all leading dims, then restore them
        flat_x = x.flatten(0, -2)
        out: Tensor = torch.ops.nanogpt.mm(flat_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0]
        return out.reshape(*x.shape[:-1], -1)
class Rotary(nn.Module):
def __init__(self, dim: int, max_seq_len: int):
super().__init__()
# half-truncate RoPE by @YouJiacheng (w/ base freq tuning)
angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=dim//4, dtype=torch.float32)
angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(dim//4)])
t = torch.arange(max_seq_len, dtype=torch.float32)
theta = torch.einsum("i,j -> ij", t, angular_freq)
self.cos = nn.Buffer(theta.cos(), persistent=False)
self.sin = nn.Buffer(theta.sin(), persistent=False)
def forward(self, x_BTHD: Tensor):
assert self.cos.size(0) >= x_BTHD.size(-3)
cos, sin = self.cos[None, :x_BTHD.size(-3), None, :], self.sin[None, :x_BTHD.size(-3), None, :]
x1, x2 = x_BTHD.to(dtype=torch.float32).chunk(2, dim=-1)
y1 = x1 * cos + x2 * sin
y2 = x1 * (-sin) + x2 * cos
return torch.cat((y1, y2), 3).type_as(x_BTHD)
class CausalSelfAttention(nn.Module):
    """Self-attention with merged QKV weights, QK-norm, rotary embeddings, an optional
    value-embedding mix-in, and FlexAttention driven by a caller-supplied block mask."""
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, head_dim=128):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        hdim = num_heads * head_dim
        # improved init scale by @YouJiacheng
        std = 0.5 * (dim ** -0.5)
        bound = (3 ** 0.5) * std
        # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng
        # https://x.com/hi_tysam/status/1879699187107033311
        self.qkv_w = nn.Parameter(torch.empty(3, hdim, dim).uniform_(-bound, bound))
        self.rotary = Rotary(head_dim, max_seq_len)
        self.c_proj = CastedLinear(hdim, dim)
        # zero init suggested by @Grad62304977
        self.c_proj.weight.detach().zero_()
        # scale the attention logits by given constant, instead of the default head_dim**-0.5, by @leloykun
        # inspired by learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283
        self.attn_scale = 0.12

    def forward(self, x: Tensor, ve: Tensor | None, lambdas: Tensor, block_mask: BlockMask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        # one matmul for Q, K and V, then split along the head axis
        fused_weight = self.qkv_w.flatten(end_dim=1)
        qkv = F.linear(x, fused_weight).view(B, T, 3 * self.num_heads, self.head_dim)
        q, k, v = qkv.chunk(3, dim=-2)
        # QK norm @Grad62304977, followed by rotary position encoding
        q = self.rotary(norm(q))
        k = self.rotary(norm(k))
        if ve is None:
            # skip mid-layers token value embeddings by @YouJiacheng
            v = lambdas[0] * v
        else:
            # @KoszarskyB & @Grad62304977
            v = lambdas[0] * v + lambdas[1] * ve.view_as(v)
        attn_out = flex_attention(
            q.transpose(1, 2),
            k.transpose(1, 2),
            v.transpose(1, 2),
            block_mask=block_mask,
            scale=self.attn_scale,
        ).transpose(1, 2)
        # re-assemble all head outputs side by side
        merged = attn_out.contiguous().view(B, T, self.num_heads * self.head_dim)
        return self.c_proj(merged)
class MLP(nn.Module):
    """Feed-forward block: expand to 4x width, apply squared ReLU, project back."""
    def __init__(self, dim: int):
        super().__init__()
        hidden_dim = 4 * dim
        self.c_fc = CastedLinear(dim, hidden_dim)
        self.c_proj = CastedLinear(hidden_dim, dim)
        # zero init suggested by @Grad62304977
        self.c_proj.weight.detach().zero_()

    def forward(self, x: Tensor):
        hidden = self.c_fc(x)
        # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        activated = F.relu(hidden).square()
        return self.c_proj(activated)
class Block(nn.Module):
    """Transformer block: learned mix of the residual stream with the initial embedding,
    then (optional) attention and an MLP, each applied to a normalized input and added
    back residually."""
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, layer_idx: int):
        super().__init__()
        # skip attention of blocks.7 (the 8th layer) by @YouJiacheng
        if layer_idx != 7:
            self.attn = CausalSelfAttention(dim, num_heads, max_seq_len)
        else:
            self.attn = None
        self.mlp = MLP(dim)

    def forward(self, x: Tensor, ve: Tensor | None, x0: Tensor, lambdas: Tensor, sa_lambdas: Tensor, block_mask: BlockMask):
        hidden = lambdas[0] * x + lambdas[1] * x0
        if self.attn is not None:
            hidden = hidden + self.attn(norm(hidden), ve, sa_lambdas, block_mask)
        hidden = hidden + self.mlp(norm(hidden))
        return hidden
# -----------------------------------------------------------------------------
# The main model
def next_multiple_of_n(v: float | int, *, n: int):
return next(x for x in range(n, int(v) + 1 + n, n) if x >= v)
class GPT(nn.Module):
    """Decoder-only language model with token value embeddings, U-net style skip
    connections, long/short sliding-window FlexAttention and a soft-capped FP8 LM head.

    Note: ``__init__`` reads the module-level ``world_size`` to pad ``self.scalars`` and
    ``forward`` hard-codes 12 block masks, so the model must be built with 12 layers after
    ``world_size`` has been defined.
    """
    def __init__(self, vocab_size: int, num_layers: int, num_heads: int, model_dim: int, max_seq_len: int):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, model_dim)
        for param in self.embed.parameters():
            param.lr_mul = 75.
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897
        # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78
        self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)])
        # a single pass over the ModuleList's parameters covers all three embeddings
        for param in self.value_embeds.parameters():
            param.lr_mul = 75.
        self.blocks = nn.ModuleList([Block(model_dim, num_heads, max_seq_len, i) for i in range(num_layers)])
        # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency.
        # suggested to me by @Grad62304977. this originates from Karpathy's experiments.
        self.lm_head = CastedLinear(model_dim, next_multiple_of_n(vocab_size, n=128), use_fp8=True, x_s=(model_dim**0.5)/448, w_s=24/448, grad_s=1/448)
        self.lm_head.weight.lr_mul = 27.5
        self.lm_head.weight.detach().zero_() # @Grad62304977
        # Add learnable skip connection weights for decoder layers
        assert num_layers % 2 == 0
        # pad the scalar vector so its length (5 * num_layers + pad) is divisible by world_size
        pad = (-num_layers * 5) % world_size
        self.scalars = nn.Parameter(torch.cat([
            torch.ones(num_layers), # skip_weights
            *[torch.tensor([1.0, 0.0]) for _ in range(num_layers)], # block lambdas
            *[torch.tensor([0.5, 0.5]) for _ in range(num_layers)], # SA lambdas
            torch.ones(pad),
        ]))
        self.scalars.lr_mul = 5.0

    def create_blockmasks(self, input_seq: Tensor, sliding_window_num_blocks: Tensor):
        """Build the (long, short) FlexAttention block masks for ``input_seq``.

        Attention is causal and restricted to tokens of the same document, where a new
        document starts at every token id 50256. The long mask uses a window of
        ``sliding_window_num_blocks`` blocks of 128 tokens, the short mask half of that.
        """
        BLOCK_SIZE = 128
        # running document index per token
        docs = (input_seq == 50256).cumsum(0)

        def document_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            #return causal_mask
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_blockmask: Tensor):
            # per query block: number of selected KV blocks, and KV block indices ordered
            # so that the selected ones come first
            num_blocks = dense_blockmask.sum(dim=-1, dtype=torch.int32)
            indices = dense_blockmask.argsort(dim=-1, descending=False, stable=True).flip(-1).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        # manual block mask creation by @YouJiacheng
        assert len(input_seq) % BLOCK_SIZE == 0
        NUM_BLOCKS = len(input_seq) // BLOCK_SIZE
        block_idx = torch.arange(NUM_BLOCKS, dtype=torch.int32, device="cuda")
        # "any": some (q, kv) pair in the block pair may attend; "all": every pair may attend
        causal_blockmask_any = block_idx[:, None] >= block_idx
        causal_blockmask_all = block_idx[:, None] > block_idx
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()
        document_blockmask_any = (docs_low[:, None] <= docs_high) & (docs_high[:, None] >= docs_low)
        document_blockmask_all = (docs_low[:, None] == docs_high) & (docs_high[:, None] == docs_low)
        blockmask_any = causal_blockmask_any & document_blockmask_any
        blockmask_all = causal_blockmask_all & document_blockmask_all
        # partial blocks still need the element-wise mask_mod; full blocks do not
        partial_kv_num_blocks, partial_kv_indices = dense_to_ordered(blockmask_any & ~blockmask_all)
        full_kv_num_blocks, full_kv_indices = dense_to_ordered(blockmask_all)

        def build_bm(window_size_blocks: Tensor) -> BlockMask:
            return BlockMask.from_kv_blocks(
                torch.clamp_max(partial_kv_num_blocks, torch.clamp_min(window_size_blocks - full_kv_num_blocks, 1)),
                partial_kv_indices,
                torch.clamp_max(full_kv_num_blocks, window_size_blocks - 1),
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )
        # Long-short SWA block masks by @leloykun & @YouJiacheng, adapated from suggestion by @Grad62304977, following Gemma 2 paper
        return build_bm(sliding_window_num_blocks), build_bm(sliding_window_num_blocks // 2)

    def forward(self, input_seq: Tensor, target_seq: Tensor, sliding_window_num_blocks: Tensor):
        """Return the cross-entropy loss of ``target_seq`` given the 1-D ``input_seq``
        (summed over tokens in training mode, averaged in eval mode)."""
        assert input_seq.ndim == 1
        ve = [value_embed(input_seq) for value_embed in self.value_embeds]
        # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure
        ve = [ve[0], ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]]
        assert len(ve) == len(self.blocks)
        long_bm, short_bm = self.create_blockmasks(input_seq, sliding_window_num_blocks)
        block_masks = [long_bm, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, long_bm, short_bm, short_bm, short_bm, long_bm]
        assert len(block_masks) == len(self.blocks)
        x = x0 = norm(self.embed(input_seq)[None]) # use of norm here by @Grad62304977
        # U-net design by @brendanh0gan
        skip_connections = []
        # layout of self.scalars: [skip weights | block lambdas | SA lambdas | padding]
        skip_weights = self.scalars[:(len(self.blocks) // 2)]
        lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2)
        sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2)
        n = len(self.blocks) // 2
        for i in range(len(self.blocks)):
            # second half: add the activation saved by the mirrored block of the first half
            if i >= n:
                x = x + skip_weights[i - n] * skip_connections.pop()
            x = self.blocks[i](x, ve[i], x0, lambdas[i], sa_lambdas[i], block_masks[i])
            if i < n:
                skip_connections.append(x)
        x = norm(x)
        logits = self.lm_head(x).float()
        # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1)
        logits = 30 * torch.sigmoid(logits / (7.5 * x.size(-1)**0.5))
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_seq, reduction='sum' if self.training else 'mean')
        return loss
# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader
def _load_data_shard(file: Path):
    """Load one token shard into a pinned uint16 tensor.

    The file starts with a header of 256 int32 values (magic number, version, token
    count, ...) followed by the tokens as uint16. The magic number, the version and the
    number of bytes actually read are checked with assertions.
    """
    header_len = 256 # header is 256 int32
    header = torch.from_file(str(file), False, header_len, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    num_tokens = int(header[2]) # number of tokens (claimed)
    with file.open("rb", buffering=0) as f:
        # avoid pin_memory copy by @YouJiacheng
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(header_len * 4)
        # avoid bytes->array copy by @YouJiacheng: read straight into the tensor's memory
        nbytes = f.readinto(tokens.numpy())
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header"
    return tokens
def distributed_data_generator(filename_pattern: str, batch_size: int, rank : int, world_size : int):
    """Infinite generator of ``(inputs, targets)`` CUDA tensors for this rank.

    Shards matching ``filename_pattern`` are read in sorted order, once each. Every
    step consumes ``batch_size`` tokens globally; this rank takes its own
    ``batch_size // world_size`` slice, with targets being the inputs shifted by one.
    """
    shard_paths = [Path(name) for name in sorted(glob.glob(filename_pattern))]
    assert batch_size % world_size == 0
    local_batch_size = batch_size // world_size
    # use itertools.cycle(files) instead if you want to do multi-epoch training
    shard_iter = iter(shard_paths)
    tokens = _load_data_shard(next(shard_iter))
    pos = 0
    while True:
        # move on to the next shard when the current one cannot supply a full batch
        if pos + batch_size + 1 >= len(tokens):
            tokens = _load_data_shard(next(shard_iter))
            pos = 0
        start = pos + rank * local_batch_size
        window = tokens[start:][:local_batch_size + 1]
        # no sync on host side; H2D in another stream isn't helpful.
        inputs = window[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True)
        targets = window[1:].to(device="cuda", dtype=torch.int64, non_blocking=True)
        pos += batch_size
        yield inputs, targets
# -----------------------------------------------------------------------------
# int main
@dataclass
class Hyperparameters:
    """Run configuration.

    Every attribute is annotated so that it is a real dataclass field: the defaults are
    unchanged, and individual values can be overridden through the constructor, e.g.
    ``Hyperparameters(num_iterations=10)``.
    """
    # data
    train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on
    val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on
    val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    # optimization
    num_iterations: int = 1770 # number of iterations to run
    cooldown_frac: float = 0.4 # fraction of training spent cooling down the learning rate
    # evaluation and logging
    val_loss_every: int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    # implementation
    seq_len: int = 48*1024 # FlexAttention sequence length
    val_seq_len: int = 4*64*1024 # FlexAttention sequence length for validation
    save_checkpoint: bool = False
# Instantiate the run configuration.
args = Hyperparameters()
# torchrun sets these env variables
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
assert torch.cuda.is_available()
# bind this process to the GPU given by LOCAL_RANK before creating the NCCL process group
device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
torch.cuda.set_device(device)
dist.init_process_group(backend="nccl", device_id=device)
dist.barrier()
master_process = (rank == 0) # this process will do logging, checkpointing etc.
#if master_process:
# wandb.init(project="modded-nanogpt-tiny", name=f"run-{os.path.basename(__file__)}", save_code=True)
# begin logging
# only the master process gets a log file; logfile stays None on every other rank
logfile = None
if master_process:
    run_id = uuid.uuid4()
    os.makedirs("logs", exist_ok=True)
    logfile = f"logs/{run_id}.txt"
    print(logfile)
def print0(s, console=True):
    """Append ``s`` to the run's log file, and echo it to stdout when ``console`` is true.
    Does nothing on non-master processes."""
    if not master_process:
        return
    with open(logfile, "a") as f:
        if console:
            print(s)
        print(s, file=f)
# begin by printing this file (the Python code)
print0(code)
print0("="*100)
# log information about the hardware/software environment this is running on
print0(f"Running Python {sys.version}")
print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
def nvidia_smi():
    """Run ``nvidia-smi`` and return its standard output as text."""
    import subprocess # avoid top level import
    completed = subprocess.run(
        ["nvidia-smi"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    return completed.stdout
if master_process:
    print0(nvidia_smi())
print0("="*100)
# Build the model on the GPU; the rotary tables are sized for the longer of the train/val sequences.
model: nn.Module = GPT(vocab_size=next_multiple_of_n(50257, n=128), num_layers=12, num_heads=6, model_dim=768, max_seq_len=max(args.seq_len, args.val_seq_len)).cuda()
# embeddings are kept in bfloat16
for m in model.modules():
    if isinstance(m, nn.Embedding):
        m.bfloat16()
# make every rank start from rank 0's weights
for param in model.parameters():
    dist.broadcast(param.detach(), 0)
# collect the parameters to optimize
hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n]
embed_params = [p for n, p in model.named_parameters() if "embed" in n]
scalar_params = [p for p in model.parameters() if p.ndim < 2]
head_params = [model.lm_head.weight]
# init the optimizer(s)
# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
# DistAdam handles scalars, the LM head and the embeddings; Muon handles the hidden matrices
optimizer1 = DistAdam(scalar_params + head_params + embed_params, lr=0.008, betas=(0.8, 0.95), eps=1e-10, weight_decay=0.0, rank=rank, world_size=world_size)
optimizer2 = Muon(hidden_matrix_params, lr=0.05, momentum=0.95, rank=rank, world_size=world_size, weight_decay=0.0)
optimizers = [optimizer1, optimizer2]
# remember the base learning rate so the schedule can rescale it every step
for opt in optimizers:
    for group in opt.param_groups:
        group["initial_lr"] = group["lr"]
# log every parameter with its shape, dtype and per-parameter multipliers
for n, p in model.named_parameters():
    wd_mul = getattr(p, "wd_mul", 1.0)
    lr_mul = getattr(p, "lr_mul", 1.0)
    print0(f"{n}: {p.shape} {p.dtype} {wd_mul} {lr_mul}")
# Count parameters
total_params = sum(p.numel() for p in model.parameters())
embedding_params = sum(p.numel() for n, p in model.named_parameters() if "embed" in n)
non_embedding_params = total_params - embedding_params
print0(f"")
print0(f"Model parameters:")
print0(f" Total parameters: {total_params:,}")
print0(f" Embedding parameters: {embedding_params:,}")
print0(f" Non-embedding parameters: {non_embedding_params:,}")
# learning rate schedule: stable then decay
def get_lr(step: int):
    """Learning-rate multiplier at ``step``: 1.0 during the stable phase, then a linear
    decay down to 0.1 over the last ``args.cooldown_frac`` of training."""
    progress = step / args.num_iterations # progress in training
    assert 0 <= progress <= 1
    # 1 -> 0 over the cooldown phase
    decay_weight = min((1 - progress) / args.cooldown_frac, 1.0)
    return decay_weight * 1.0 + (1 - decay_weight) * 0.1
@lru_cache(1)
def get_window_size_blocks_helper(window_size: int):
    """Return ``window_size // 128`` as an int32 CUDA scalar tensor. The most recent
    result is cached, so a new tensor is only created when the window size changes."""
    pinned = torch.tensor(window_size // 128, dtype=torch.int32, pin_memory=True)
    return pinned.cuda(non_blocking=True)
def get_window_size_blocks(step: int):
    """Sliding-window size (in blocks of 128 tokens) to use at training step ``step``."""
    progress = step / args.num_iterations # progress in training
    assert 0 <= progress <= 1
    # Linearly increase the block-wise sliding window size over training 128 -> 1792
    # increase by @fernbear.bsky.social; block-wise by @YouJiacheng
    window_tokens = next_multiple_of_n(1728 * progress, n=128)
    return get_window_size_blocks_helper(window_tokens)
model: nn.Module = torch.compile(model, mode="reduce-overhead", fullgraph=True, dynamic=False)
# Warmup the training kernels, then re-initialize the state so we aren't cheating
warmup_steps = 10
initial_state = dict(model=copy.deepcopy(model.state_dict()),
                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state
train_loader = distributed_data_generator(args.train_files, world_size * args.seq_len, rank, world_size)
# first warmup pass: triggers compilation of the forward/backward and optimizer kernels
for _ in range(warmup_steps):
    inputs, targets = next(train_loader)
    torch.compiler.cudagraph_mark_step_begin()
    with autocast(device_type="cuda", dtype=torch.bfloat16):
        loss = model(inputs, targets, get_window_size_blocks(1))
    loss.backward()
    for opt in optimizers:
        opt.step()
    model.zero_grad(set_to_none=True)
torch.cuda.synchronize()
dist.barrier()
# second warmup pass, recorded by the profiler; one trace file is written per rank
with torch.profiler.profile() as prof:
    for _ in range(warmup_steps):
        torch.compiler.cudagraph_mark_step_begin()
        inputs, targets = next(train_loader)
        with autocast(device_type="cuda", dtype=torch.bfloat16):
            loss = model(inputs, targets, get_window_size_blocks(1))
        loss.backward()
        for opt in optimizers:
            opt.step()
        model.zero_grad(set_to_none=True)
    torch.cuda.synchronize()
    dist.barrier()
os.makedirs("traces", exist_ok=True)
prof.export_chrome_trace(f"traces/trace_{rank}.json")
# undo the effect of the warmup steps on the weights and the optimizer state
model.load_state_dict(initial_state['model'])
for opt, opt_state in zip(optimizers, initial_state['optimizers']):
    opt.load_state_dict(opt_state)
del train_loader, initial_state
# fresh loader so that training starts from the first shard again
train_loader = distributed_data_generator(args.train_files, world_size * args.seq_len, rank, world_size)
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
# The loop body runs train_steps + 1 times: iterations 0 .. train_steps - 1 each
# perform one optimizer update, while the extra final iteration only validates
# (and optionally checkpoints) before breaking out.
train_steps = args.num_iterations
for step in range(train_steps + 1):
    last_step = step == train_steps
    torch.compiler.cudagraph_mark_step_begin()

    # --------------- VALIDATION SECTION -----------------
    # Validate on the final iteration and, when enabled, every val_loss_every steps.
    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
        # stop the clock so that validation is not counted as training time
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)

        model.eval()
        val_batch_size = world_size * args.val_seq_len
        # val_tokens must split into a whole number of validation batches
        val_steps, leftover_tokens = divmod(args.val_tokens, val_batch_size)
        assert leftover_tokens == 0
        val_loader = distributed_data_generator(args.val_files, val_batch_size, rank, world_size)

        val_loss = 0
        with torch.no_grad():
            for _ in range(val_steps):
                inputs, targets = next(val_loader)
                with autocast(device_type="cuda", dtype=torch.bfloat16):
                    val_loss += model(inputs, targets, get_window_size_blocks(step))
        # mean over this rank's batches, then mean across ranks
        val_loss /= val_steps
        del val_loader
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        #if master_process:
        #    wandb.log({"val/loss": val_loss}, step=step)
        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)

        model.train()
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if last_step:
        if master_process and args.save_checkpoint:
            # bundle the step, this script's source text, and model/optimizer state
            checkpoint = {
                "step": step,
                "code": code,
                "model": model.state_dict(),
                "optimizers": [opt.state_dict() for opt in optimizers],
            }
            os.makedirs(f"logs/{run_id}", exist_ok=True)
            torch.save(checkpoint, f"logs/{run_id}/state_step{step:06d}.pt")
        # the last step only has the validation loop, so break to avoid training
        break

    # --------------- TRAINING SECTION -----------------
    inputs, targets = next(train_loader)
    with autocast(device_type="cuda", dtype=torch.bfloat16):
        loss = model(inputs, targets, get_window_size_blocks(step))
    loss.backward()

    # set optimization hyperparameters:
    # every param group's lr is its stored initial_lr scaled by get_lr(step)
    for opt in optimizers:
        for group in opt.param_groups:
            group["lr"] = group["initial_lr"] * get_lr(step)
    # optimizer2's momentum moves linearly from 0.85 to 0.95 over the first 300 steps
    frac = min(step / 300, 1)
    warmup_momentum = (1 - frac) * 0.85 + frac * 0.95
    for group in optimizer2.param_groups:
        group["momentum"] = warmup_momentum

    # step the optimizers and schedulers
    for opt in optimizers:
        opt.step()
    # null the gradients
    model.zero_grad(set_to_none=True)

    # logging
    # No device synchronisation happens before this clock read, so the figure is
    # only approximate (GPU work still in flight may not be included).
    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)
# Report the peak CUDA memory statistics, floor-divided twice by 1024 and
# labelled MiB, then tear down the distributed process group.
peak_allocated_mib = torch.cuda.max_memory_allocated() // 1024 // 1024
peak_reserved_mib = torch.cuda.max_memory_reserved() // 1024 // 1024
print0(f"peak memory allocated: {peak_allocated_mib} MiB "
       f"reserved: {peak_reserved_mib} MiB", console=True)
dist.destroy_process_group()
====================================================================================================
Running Python 3.12.3 (main, Feb 4 2025, 14:48:35) [GCC 13.3.0]
Running PyTorch 2.7.0a0+79aa17489c.nv25.04 compiled for CUDA 12.9
Fri May 30 12:25:55 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.05 Driver Version: 550.127.05 CUDA Version: 12.9 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA H100 80GB HBM3 On | 00000000:04:00.0 Off | 0 |
| N/A 44C P0 129W / 700W | 5856MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA H100 80GB HBM3 On | 00000000:05:00.0 Off | 0 |
| N/A 39C P0 126W / 700W | 1518MiB / 81559MiB | 1% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 2 NVIDIA H100 80GB HBM3 On | 00000000:0B:00.0 Off | 0 |
| N/A 45C P0 132W / 700W | 1518MiB / 81559MiB | 1% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 3 NVIDIA H100 80GB HBM3 On | 00000000:0C:00.0 Off | 0 |
| N/A 38C P0 124W / 700W | 1518MiB / 81559MiB | 1% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 4 NVIDIA H100 80GB HBM3 On | 00000000:84:00.0 Off | 0 |
| N/A 44C P0 139W / 700W | 1518MiB / 81559MiB | 1% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 5 NVIDIA H100 80GB HBM3 On | 00000000:85:00.0 Off | 0 |
| N/A 37C P0 117W / 700W | 1518MiB / 81559MiB | 1% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 6 NVIDIA H100 80GB HBM3 On | 00000000:8B:00.0 Off | 0 |
| N/A 41C P0 119W / 700W | 1518MiB / 81559MiB | 1% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 7 NVIDIA H100 80GB HBM3 On | 00000000:8C:00.0 Off | 0 |
| N/A 38C P0 117W / 700W | 1518MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
+-----------------------------------------------------------------------------------------+
====================================================================================================
scalars: torch.Size([64]) torch.float32 1.0 5.0
embed.weight: torch.Size([50304, 768]) torch.bfloat16 1.0 75.0
value_embeds.0.weight: torch.Size([50304, 768]) torch.bfloat16 1.0 75.0
value_embeds.1.weight: torch.Size([50304, 768]) torch.bfloat16 1.0 75.0
value_embeds.2.weight: torch.Size([50304, 768]) torch.bfloat16 1.0 75.0
blocks.0.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0
blocks.0.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0
blocks.0.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0
blocks.0.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0
blocks.1.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0
blocks.1.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0
blocks.1.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0
blocks.1.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0
blocks.2.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0
blocks.2.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0
blocks.2.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0
blocks.2.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0
blocks.3.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0
blocks.3.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0
blocks.3.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0
blocks.3.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0
blocks.4.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0
blocks.4.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0
blocks.4.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0
blocks.4.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0
blocks.5.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0
blocks.5.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0
blocks.5.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0
blocks.5.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0
blocks.6.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0
blocks.6.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0
blocks.6.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0
blocks.6.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0
blocks.7.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0
blocks.7.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0
blocks.8.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0
blocks.8.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0
blocks.8.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0
blocks.8.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0
blocks.9.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0
blocks.9.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0
blocks.9.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0
blocks.9.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0
blocks.10.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0
blocks.10.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0
blocks.10.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0
blocks.10.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0
blocks.11.attn.qkv_w: torch.Size([3, 768, 768]) torch.float32 1.0 1.0
blocks.11.attn.c_proj.weight: torch.Size([768, 768]) torch.float32 1.0 1.0
blocks.11.mlp.c_fc.weight: torch.Size([3072, 768]) torch.float32 1.0 1.0
blocks.11.mlp.c_proj.weight: torch.Size([768, 3072]) torch.float32 1.0 1.0
lm_head.weight: torch.Size([50304, 768]) torch.float32 1.0 27.5
Model parameters:
Total parameters: 275,742,784
Embedding parameters: 154,533,888
Non-embedding parameters: 121,208,896
step:0/1770 val_loss:10.8258 train_time:0ms step_avg:0.03ms
step:1/1770 train_time:157ms step_avg:157.17ms
step:2/1770 train_time:170ms step_avg:84.81ms
step:3/1770 train_time:178ms step_avg:59.47ms
step:4/1770 train_time:187ms step_avg:46.80ms
step:5/1770 train_time:260ms step_avg:52.10ms
step:6/1770 train_time:353ms step_avg:58.84ms
step:7/1770 train_time:446ms step_avg:63.76ms
step:8/1770 train_time:539ms step_avg:67.37ms
step:9/1770 train_time:633ms step_avg:70.32ms
step:10/1770 train_time:726ms step_avg:72.56ms
step:11/1770 train_time:818ms step_avg:74.41ms
step:12/1770 train_time:912ms step_avg:75.96ms
step:13/1770 train_time:1005ms step_avg:77.32ms
step:14/1770 train_time:1102ms step_avg:78.72ms
step:15/1770 train_time:1199ms step_avg:79.96ms
step:16/1770 train_time:1295ms step_avg:80.92ms
step:17/1770 train_time:1389ms step_avg:81.69ms
step:18/1770 train_time:1483ms step_avg:82.41ms
step:19/1770 train_time:1577ms step_avg:82.99ms
step:20/1770 train_time:1670ms step_avg:83.52ms
step:21/1770 train_time:1764ms step_avg:84.00ms
step:22/1770 train_time:1857ms step_avg:84.41ms
step:23/1770 train_time:1951ms step_avg:84.81ms
step:24/1770 train_time:2046ms step_avg:85.25ms
step:25/1770 train_time:2142ms step_avg:85.67ms
step:26/1770 train_time:2238ms step_avg:86.06ms
step:27/1770 train_time:2333ms step_avg:86.42ms
step:28/1770 train_time:2428ms step_avg:86.71ms
step:29/1770 train_time:2522ms step_avg:86.97ms
step:30/1770 train_time:2616ms step_avg:87.20ms
step:31/1770 train_time:2709ms step_avg:87.40ms
step:32/1770 train_time:2803ms step_avg:87.60ms
step:33/1770 train_time:2897ms step_avg:87.78ms
step:34/1770 train_time:2992ms step_avg:87.99ms
step:35/1770 train_time:3086ms step_avg:88.18ms
step:36/1770 train_time:3181ms step_avg:88.35ms
step:37/1770 train_time:3276ms step_avg:88.55ms
step:38/1770 train_time:3372ms step_avg:88.73ms
step:39/1770 train_time:3466ms step_avg:88.88ms
step:40/1770 train_time:3560ms step_avg:89.01ms
step:41/1770 train_time:3654ms step_avg:89.13ms
step:42/1770 train_time:3748ms step_avg:89.25ms
step:43/1770 train_time:3841ms step_avg:89.33ms
step:44/1770 train_time:3936ms step_avg:89.46ms
step:45/1770 train_time:4030ms step_avg:89.57ms
step:46/1770 train_time:4125ms step_avg:89.68ms
step:47/1770 train_time:4219ms step_avg:89.77ms
step:48/1770 train_time:4315ms step_avg:89.89ms
step:49/1770 train_time:4411ms step_avg:90.01ms
step:50/1770 train_time:4506ms step_avg:90.12ms
step:51/1770 train_time:4599ms step_avg:90.17ms
step:52/1770 train_time:4693ms step_avg:90.26ms
step:53/1770 train_time:4788ms step_avg:90.33ms
step:54/1770 train_time:4882ms step_avg:90.41ms
step:55/1770 train_time:4976ms step_avg:90.48ms
step:56/1770 train_time:5072ms step_avg:90.57ms
step:57/1770 train_time:5166ms step_avg:90.63ms
step:58/1770 train_time:5260ms step_avg:90.69ms
step:59/1770 train_time:5355ms step_avg:90.77ms
step:60/1770 train_time:5451ms step_avg:90.85ms
step:61/1770 train_time:5545ms step_avg:90.91ms
step:62/1770 train_time:5639ms step_avg:90.95ms
step:63/1770 train_time:5733ms step_avg:91.01ms
step:64/1770 train_time:5827ms step_avg:91.05ms
step:65/1770 train_time:5921ms step_avg:91.09ms
step:66/1770 train_time:6016ms step_avg:91.15ms
step:67/1770 train_time:6110ms step_avg:91.20ms
step:68/1770 train_time:6204ms step_avg:91.24ms
step:69/1770 train_time:6299ms step_avg:91.28ms
step:70/1770 train_time:6394ms step_avg:91.34ms
step:71/1770 train_time:6489ms step_avg:91.40ms
step:72/1770 train_time:6583ms step_avg:91.43ms
step:73/1770 train_time:6677ms step_avg:91.47ms
step:74/1770 train_time:6772ms step_avg:91.51ms
step:75/1770 train_time:6866ms step_avg:91.55ms
step:76/1770 train_time:6960ms step_avg:91.58ms
step:77/1770 train_time:7056ms step_avg:91.63ms
step:78/1770 train_time:7151ms step_avg:91.67ms
step:79/1770 train_time:7244ms step_avg:91.70ms
step:80/1770 train_time:7338ms step_avg:91.72ms
step:81/1770 train_time:7434ms step_avg:91.77ms
step:82/1770 train_time:7528ms step_avg:91.80ms
step:83/1770 train_time:7621ms step_avg:91.82ms
step:84/1770 train_time:7717ms step_avg:91.87ms
step:85/1770 train_time:7812ms step_avg:91.91ms
step:86/1770 train_time:7907ms step_avg:91.94ms
step:87/1770 train_time:8001ms step_avg:91.96ms
step:88/1770 train_time:8095ms step_avg:91.99ms
step:89/1770 train_time:8191ms step_avg:92.03ms
step:90/1770 train_time:8284ms step_avg:92.05ms
step:91/1770 train_time:8379ms step_avg:92.08ms
step:92/1770 train_time:8474ms step_avg:92.11ms
step:93/1770 train_time:8568ms step_avg:92.13ms
step:94/1770 train_time:8663ms step_avg:92.16ms
step:95/1770 train_time:8757ms step_avg:92.18ms
step:96/1770 train_time:8851ms step_avg:92.20ms
step:97/1770 train_time:8944ms step_avg:92.21ms
step:98/1770 train_time:9038ms step_avg:92.23ms
step:99/1770 train_time:9134ms step_avg:92.26ms
step:100/1770 train_time:9228ms step_avg:92.28ms
step:101/1770 train_time:9322ms step_avg:92.30ms
step:102/1770 train_time:9416ms step_avg:92.32ms
step:103/1770 train_time:9511ms step_avg:92.34ms
step:104/1770 train_time:9605ms step_avg:92.36ms
step:105/1770 train_time:9699ms step_avg:92.37ms
step:106/1770 train_time:9794ms step_avg:92.39ms
step:107/1770 train_time:9887ms step_avg:92.41ms
step:108/1770 train_time:9982ms step_avg:92.42ms
step:109/1770 train_time:10076ms step_avg:92.44ms
step:110/1770 train_time:10171ms step_avg:92.46ms
step:111/1770 train_time:10266ms step_avg:92.48ms
step:112/1770 train_time:10360ms step_avg:92.50ms
step:113/1770 train_time:10455ms step_avg:92.52ms
step:114/1770 train_time:10549ms step_avg:92.54ms
step:115/1770 train_time:10643ms step_avg:92.55ms
step:116/1770 train_time:10737ms step_avg:92.56ms
step:117/1770 train_time:10832ms step_avg:92.58ms
step:118/1770 train_time:10926ms step_avg:92.60ms
step:119/1770 train_time:11020ms step_avg:92.61ms
step:120/1770 train_time:11115ms step_avg:92.63ms
step:121/1770 train_time:11210ms step_avg:92.64ms
step:122/1770 train_time:11305ms step_avg:92.66ms
step:123/1770 train_time:11399ms step_avg:92.68ms
step:124/1770 train_time:11494ms step_avg:92.69ms
step:125/1770 train_time:11589ms step_avg:92.71ms
step:125/1770 val_loss:4.6449 train_time:11860ms step_avg:94.88ms
step:126/1770 train_time:11962ms step_avg:94.94ms
step:127/1770 train_time:11986ms step_avg:94.38ms
step:128/1770 train_time:12059ms step_avg:94.21ms
step:129/1770 train_time:12100ms step_avg:93.80ms
step:130/1770 train_time:12157ms step_avg:93.52ms
step:131/1770 train_time:12217ms step_avg:93.26ms
step:132/1770 train_time:12258ms step_avg:92.86ms
step:133/1770 train_time:12351ms step_avg:92.86ms
step:134/1770 train_time:12444ms step_avg:92.87ms
step:135/1770 train_time:12537ms step_avg:92.87ms
step:136/1770 train_time:12631ms step_avg:92.88ms
step:137/1770 train_time:12727ms step_avg:92.90ms
step:138/1770 train_time:12826ms step_avg:92.94ms
step:139/1770 train_time:12923ms step_avg:92.97ms
step:140/1770 train_time:13018ms step_avg:92.99ms
step:141/1770 train_time:13114ms step_avg:93.00ms
step:142/1770 train_time:13208ms step_avg:93.02ms
step:143/1770 train_time:13303ms step_avg:93.03ms
step:144/1770 train_time:13397ms step_avg:93.04ms
step:145/1770 train_time:13491ms step_avg:93.04ms
step:146/1770 train_time:13585ms step_avg:93.05ms
step:147/1770 train_time:13680ms step_avg:93.06ms
step:148/1770 train_time:13776ms step_avg:93.08ms
step:149/1770 train_time:13873ms step_avg:93.11ms
step:150/1770 train_time:13971ms step_avg:93.14ms
step:151/1770 train_time:14068ms step_avg:93.17ms
step:152/1770 train_time:14163ms step_avg:93.18ms
step:153/1770 train_time:14257ms step_avg:93.18ms
step:154/1770 train_time:14352ms step_avg:93.19ms
step:155/1770 train_time:14447ms step_avg:93.21ms
step:156/1770 train_time:14541ms step_avg:93.21ms
step:157/1770 train_time:14635ms step_avg:93.22ms
step:158/1770 train_time:14731ms step_avg:93.23ms
step:159/1770 train_time:14828ms step_avg:93.26ms
step:160/1770 train_time:14924ms step_avg:93.27ms
step:161/1770 train_time:15018ms step_avg:93.28ms
step:162/1770 train_time:15113ms step_avg:93.29ms
step:163/1770 train_time:15209ms step_avg:93.31ms
step:164/1770 train_time:15304ms step_avg:93.32ms
step:165/1770 train_time:15399ms step_avg:93.33ms
step:166/1770 train_time:15493ms step_avg:93.33ms
step:167/1770 train_time:15588ms step_avg:93.34ms
step:168/1770 train_time:15683ms step_avg:93.35ms
step:169/1770 train_time:15778ms step_avg:93.36ms
step:170/1770 train_time:15873ms step_avg:93.37ms
step:171/1770 train_time:15971ms step_avg:93.39ms
step:172/1770 train_time:16066ms step_avg:93.41ms
step:173/1770 train_time:16161ms step_avg:93.42ms
step:174/1770 train_time:16256ms step_avg:93.42ms
step:175/1770 train_time:16351ms step_avg:93.43ms
step:176/1770 train_time:16447ms step_avg:93.45ms
step:177/1770 train_time:16541ms step_avg:93.45ms
step:178/1770 train_time:16635ms step_avg:93.46ms
step:179/1770 train_time:16731ms step_avg:93.47ms
step:180/1770 train_time:16828ms step_avg:93.49ms
step:181/1770 train_time:16924ms step_avg:93.50ms
step:182/1770 train_time:17019ms step_avg:93.51ms
step:183/1770 train_time:17114ms step_avg:93.52ms
step:184/1770 train_time:17210ms step_avg:93.53ms
step:185/1770 train_time:17305ms step_avg:93.54ms
step:186/1770 train_time:17399ms step_avg:93.54ms
step:187/1770 train_time:17494ms step_avg:93.55ms
step:188/1770 train_time:17589ms step_avg:93.56ms
step:189/1770 train_time:17684ms step_avg:93.57ms
step:190/1770 train_time:17779ms step_avg:93.57ms
step:191/1770 train_time:17874ms step_avg:93.58ms
step:192/1770 train_time:17970ms step_avg:93.59ms
step:193/1770 train_time:18066ms step_avg:93.61ms
step:194/1770 train_time:18162ms step_avg:93.62ms
step:195/1770 train_time:18257ms step_avg:93.62ms
step:196/1770 train_time:18351ms step_avg:93.63ms
step:197/1770 train_time:18447ms step_avg:93.64ms
step:198/1770 train_time:18542ms step_avg:93.65ms
step:199/1770 train_time:18637ms step_avg:93.65ms
step:200/1770 train_time:18731ms step_avg:93.65ms
step:201/1770 train_time:18827ms step_avg:93.67ms
step:202/1770 train_time:18923ms step_avg:93.68ms
step:203/1770 train_time:19018ms step_avg:93.68ms
step:204/1770 train_time:19113ms step_avg:93.69ms
step:205/1770 train_time:19210ms step_avg:93.71ms
step:206/1770 train_time:19305ms step_avg:93.71ms
step:207/1770 train_time:19400ms step_avg:93.72ms
step:208/1770 train_time:19495ms step_avg:93.72ms
step:209/1770 train_time:19590ms step_avg:93.73ms
step:210/1770 train_time:19685ms step_avg:93.74ms
step:211/1770 train_time:19779ms step_avg:93.74ms
step:212/1770 train_time:19874ms step_avg:93.75ms
step:213/1770 train_time:19970ms step_avg:93.76ms
step:214/1770 train_time:20067ms step_avg:93.77ms
step:215/1770 train_time:20163ms step_avg:93.78ms
step:216/1770 train_time:20258ms step_avg:93.79ms
step:217/1770 train_time:20352ms step_avg:93.79ms
step:218/1770 train_time:20449ms step_avg:93.80ms
step:219/1770 train_time:20544ms step_avg:93.81ms
step:220/1770 train_time:20640ms step_avg:93.82ms
step:221/1770 train_time:20735ms step_avg:93.82ms
step:222/1770 train_time:20831ms step_avg:93.83ms
step:223/1770 train_time:20926ms step_avg:93.84ms
step:224/1770 train_time:21022ms step_avg:93.85ms
step:225/1770 train_time:21117ms step_avg:93.85ms
step:226/1770 train_time:21212ms step_avg:93.86ms
step:227/1770 train_time:21309ms step_avg:93.87ms
step:228/1770 train_time:21404ms step_avg:93.88ms
step:229/1770 train_time:21498ms step_avg:93.88ms
step:230/1770 train_time:21593ms step_avg:93.88ms
step:231/1770 train_time:21688ms step_avg:93.89ms
step:232/1770 train_time:21784ms step_avg:93.89ms
step:233/1770 train_time:21878ms step_avg:93.90ms
step:234/1770 train_time:21974ms step_avg:93.91ms
step:235/1770 train_time:22070ms step_avg:93.92ms
step:236/1770 train_time:22166ms step_avg:93.92ms
step:237/1770 train_time:22261ms step_avg:93.93ms
step:238/1770 train_time:22356ms step_avg:93.93ms
step:239/1770 train_time:22450ms step_avg:93.93ms
step:240/1770 train_time:22546ms step_avg:93.94ms
step:241/1770 train_time:22640ms step_avg:93.94ms
step:242/1770 train_time:22735ms step_avg:93.95ms
step:243/1770 train_time:22831ms step_avg:93.96ms
step:244/1770 train_time:22928ms step_avg:93.97ms
step:245/1770 train_time:23024ms step_avg:93.98ms
step:246/1770 train_time:23119ms step_avg:93.98ms
step:247/1770 train_time:23214ms step_avg:93.98ms
step:248/1770 train_time:23311ms step_avg:93.99ms
step:249/1770 train_time:23407ms step_avg:94.00ms
step:250/1770 train_time:23502ms step_avg:94.01ms
step:250/1770 val_loss:4.1038 train_time:23775ms step_avg:95.10ms
step:251/1770 train_time:23786ms step_avg:94.76ms
step:252/1770 train_time:23795ms step_avg:94.42ms
step:253/1770 train_time:23804ms step_avg:94.09ms
step:254/1770 train_time:23891ms step_avg:94.06ms
step:255/1770 train_time:23988ms step_avg:94.07ms
step:256/1770 train_time:24085ms step_avg:94.08ms
step:257/1770 train_time:24182ms step_avg:94.09ms
step:258/1770 train_time:24277ms step_avg:94.10ms
step:259/1770 train_time:24370ms step_avg:94.09ms
step:260/1770 train_time:24465ms step_avg:94.09ms
step:261/1770 train_time:24560ms step_avg:94.10ms
step:262/1770 train_time:24653ms step_avg:94.10ms
step:263/1770 train_time:24749ms step_avg:94.10ms
step:264/1770 train_time:24847ms step_avg:94.12ms
step:265/1770 train_time:24945ms step_avg:94.13ms
step:266/1770 train_time:25042ms step_avg:94.14ms
step:267/1770 train_time:25139ms step_avg:94.15ms
step:268/1770 train_time:25235ms step_avg:94.16ms
step:269/1770 train_time:25331ms step_avg:94.17ms
step:270/1770 train_time:25425ms step_avg:94.17ms
step:271/1770 train_time:25520ms step_avg:94.17ms
step:272/1770 train_time:25616ms step_avg:94.18ms
step:273/1770 train_time:25711ms step_avg:94.18ms
step:274/1770 train_time:25807ms step_avg:94.19ms
step:275/1770 train_time:25906ms step_avg:94.20ms
step:276/1770 train_time:26004ms step_avg:94.22ms
step:277/1770 train_time:26100ms step_avg:94.23ms
step:278/1770 train_time:26197ms step_avg:94.23ms
step:279/1770 train_time:26292ms step_avg:94.24ms
step:280/1770 train_time:26388ms step_avg:94.24ms
step:281/1770 train_time:26484ms step_avg:94.25ms
step:282/1770 train_time:26581ms step_avg:94.26ms
step:283/1770 train_time:26677ms step_avg:94.26ms
step:284/1770 train_time:26773ms step_avg:94.27ms
step:285/1770 train_time:26870ms step_avg:94.28ms
step:286/1770 train_time:26967ms step_avg:94.29ms
step:287/1770 train_time:27064ms step_avg:94.30ms
step:288/1770 train_time:27160ms step_avg:94.31ms
step:289/1770 train_time:27257ms step_avg:94.32ms
step:290/1770 train_time:27352ms step_avg:94.32ms
step:291/1770 train_time:27448ms step_avg:94.32ms
step:292/1770 train_time:27544ms step_avg:94.33ms
step:293/1770 train_time:27640ms step_avg:94.33ms
step:294/1770 train_time:27736ms step_avg:94.34ms
step:295/1770 train_time:27832ms step_avg:94.35ms
step:296/1770 train_time:27928ms step_avg:94.35ms
step:297/1770 train_time:28025ms step_avg:94.36ms
step:298/1770 train_time:28122ms step_avg:94.37ms
step:299/1770 train_time:28219ms step_avg:94.38ms
step:300/1770 train_time:28314ms step_avg:94.38ms
step:301/1770 train_time:28409ms step_avg:94.38ms
step:302/1770 train_time:28505ms step_avg:94.39ms
step:303/1770 train_time:28602ms step_avg:94.40ms
step:304/1770 train_time:28697ms step_avg:94.40ms
step:305/1770 train_time:28793ms step_avg:94.40ms
step:306/1770 train_time:28888ms step_avg:94.41ms
step:307/1770 train_time:28985ms step_avg:94.41ms
step:308/1770 train_time:29082ms step_avg:94.42ms
step:309/1770 train_time:29180ms step_avg:94.43ms
step:310/1770 train_time:29277ms step_avg:94.44ms
step:311/1770 train_time:29372ms step_avg:94.44ms
step:312/1770 train_time:29468ms step_avg:94.45ms
step:313/1770 train_time:29564ms step_avg:94.45ms
step:314/1770 train_time:29661ms step_avg:94.46ms
step:315/1770 train_time:29757ms step_avg:94.47ms
step:316/1770 train_time:29853ms step_avg:94.47ms
step:317/1770 train_time:29949ms step_avg:94.48ms
step:318/1770 train_time:30045ms step_avg:94.48ms
step:319/1770 train_time:30142ms step_avg:94.49ms
step:320/1770 train_time:30238ms step_avg:94.49ms
step:321/1770 train_time:30333ms step_avg:94.50ms
step:322/1770 train_time:30429ms step_avg:94.50ms
step:323/1770 train_time:30525ms step_avg:94.51ms
step:324/1770 train_time:30622ms step_avg:94.51ms
step:325/1770 train_time:30718ms step_avg:94.52ms
step:326/1770 train_time:30813ms step_avg:94.52ms
step:327/1770 train_time:30909ms step_avg:94.52ms
step:328/1770 train_time:31006ms step_avg:94.53ms
step:329/1770 train_time:31103ms step_avg:94.54ms
step:330/1770 train_time:31200ms step_avg:94.54ms
step:331/1770 train_time:31296ms step_avg:94.55ms
step:332/1770 train_time:31391ms step_avg:94.55ms
step:333/1770 train_time:31487ms step_avg:94.55ms
step:334/1770 train_time:31583ms step_avg:94.56ms
step:335/1770 train_time:31679ms step_avg:94.56ms
step:336/1770 train_time:31774ms step_avg:94.57ms
step:337/1770 train_time:31870ms step_avg:94.57ms
step:338/1770 train_time:31966ms step_avg:94.57ms
step:339/1770 train_time:32063ms step_avg:94.58ms
step:340/1770 train_time:32159ms step_avg:94.59ms
step:341/1770 train_time:32255ms step_avg:94.59ms
step:342/1770 train_time:32351ms step_avg:94.59ms
step:343/1770 train_time:32447ms step_avg:94.60ms
step:344/1770 train_time:32544ms step_avg:94.61ms
step:345/1770 train_time:32641ms step_avg:94.61ms
step:346/1770 train_time:32738ms step_avg:94.62ms
step:347/1770 train_time:32833ms step_avg:94.62ms
step:348/1770 train_time:32930ms step_avg:94.63ms
step:349/1770 train_time:33026ms step_avg:94.63ms
step:350/1770 train_time:33124ms step_avg:94.64ms
step:351/1770 train_time:33221ms step_avg:94.65ms
step:352/1770 train_time:33318ms step_avg:94.65ms
step:353/1770 train_time:33414ms step_avg:94.66ms
step:354/1770 train_time:33509ms step_avg:94.66ms
step:355/1770 train_time:33606ms step_avg:94.66ms
step:356/1770 train_time:33703ms step_avg:94.67ms
step:357/1770 train_time:33799ms step_avg:94.68ms
step:358/1770 train_time:33895ms step_avg:94.68ms
step:359/1770 train_time:33990ms step_avg:94.68ms
step:360/1770 train_time:34086ms step_avg:94.68ms
step:361/1770 train_time:34183ms step_avg:94.69ms
step:362/1770 train_time:34279ms step_avg:94.69ms
step:363/1770 train_time:34376ms step_avg:94.70ms
step:364/1770 train_time:34471ms step_avg:94.70ms
step:365/1770 train_time:34567ms step_avg:94.70ms
step:366/1770 train_time:34664ms step_avg:94.71ms
step:367/1770 train_time:34761ms step_avg:94.72ms
step:368/1770 train_time:34857ms step_avg:94.72ms
step:369/1770 train_time:34953ms step_avg:94.72ms
step:370/1770 train_time:35048ms step_avg:94.73ms
step:371/1770 train_time:35145ms step_avg:94.73ms
step:372/1770 train_time:35242ms step_avg:94.74ms
step:373/1770 train_time:35338ms step_avg:94.74ms
step:374/1770 train_time:35435ms step_avg:94.74ms
step:375/1770 train_time:35529ms step_avg:94.75ms
step:375/1770 val_loss:3.8967 train_time:35806ms step_avg:95.48ms
step:376/1770 train_time:35817ms step_avg:95.26ms
step:377/1770 train_time:35826ms step_avg:95.03ms
step:378/1770 train_time:35835ms step_avg:94.80ms
step:379/1770 train_time:35919ms step_avg:94.77ms
step:380/1770 train_time:36018ms step_avg:94.78ms
step:381/1770 train_time:36113ms step_avg:94.78ms
step:382/1770 train_time:36208ms step_avg:94.79ms
step:383/1770 train_time:36304ms step_avg:94.79ms
step:384/1770 train_time:36399ms step_avg:94.79ms
step:385/1770 train_time:36494ms step_avg:94.79ms
step:386/1770 train_time:36589ms step_avg:94.79ms
step:387/1770 train_time:36685ms step_avg:94.79ms
step:388/1770 train_time:36784ms step_avg:94.80ms
step:389/1770 train_time:36883ms step_avg:94.81ms
step:390/1770 train_time:36981ms step_avg:94.82ms
step:391/1770 train_time:37078ms step_avg:94.83ms
step:392/1770 train_time:37173ms step_avg:94.83ms
step:393/1770 train_time:37269ms step_avg:94.83ms
step:394/1770 train_time:37365ms step_avg:94.84ms
step:395/1770 train_time:37461ms step_avg:94.84ms
step:396/1770 train_time:37556ms step_avg:94.84ms
step:397/1770 train_time:37652ms step_avg:94.84ms
step:398/1770 train_time:37750ms step_avg:94.85ms
step:399/1770 train_time:37850ms step_avg:94.86ms
step:400/1770 train_time:37949ms step_avg:94.87ms
step:401/1770 train_time:38048ms step_avg:94.88ms
step:402/1770 train_time:38147ms step_avg:94.89ms
step:403/1770 train_time:38247ms step_avg:94.91ms
step:404/1770 train_time:38345ms step_avg:94.91ms
step:405/1770 train_time:38444ms step_avg:94.92ms
step:406/1770 train_time:38542ms step_avg:94.93ms
step:407/1770 train_time:38641ms step_avg:94.94ms
step:408/1770 train_time:38739ms step_avg:94.95ms
step:409/1770 train_time:38838ms step_avg:94.96ms
step:410/1770 train_time:38936ms step_avg:94.96ms
step:411/1770 train_time:39034ms step_avg:94.97ms
step:412/1770 train_time:39132ms step_avg:94.98ms
step:413/1770 train_time:39229ms step_avg:94.99ms
step:414/1770 train_time:39326ms step_avg:94.99ms
step:415/1770 train_time:39425ms step_avg:95.00ms
step:416/1770 train_time:39523ms step_avg:95.01ms
step:417/1770 train_time:39621ms step_avg:95.01ms
step:418/1770 train_time:39719ms step_avg:95.02ms
step:419/1770 train_time:39818ms step_avg:95.03ms
step:420/1770 train_time:39916ms step_avg:95.04ms
step:421/1770 train_time:40015ms step_avg:95.05ms
step:422/1770 train_time:40114ms step_avg:95.06ms
step:423/1770 train_time:40212ms step_avg:95.06ms
step:424/1770 train_time:40310ms step_avg:95.07ms
step:425/1770 train_time:40408ms step_avg:95.08ms
step:426/1770 train_time:40507ms step_avg:95.09ms
step:427/1770 train_time:40605ms step_avg:95.09ms
step:428/1770 train_time:40704ms step_avg:95.10ms
step:429/1770 train_time:40803ms step_avg:95.11ms
step:430/1770 train_time:40901ms step_avg:95.12ms
step:431/1770 train_time:41000ms step_avg:95.13ms
step:432/1770 train_time:41099ms step_avg:95.14ms
step:433/1770 train_time:41196ms step_avg:95.14ms
step:434/1770 train_time:41294ms step_avg:95.15ms
step:435/1770 train_time:41391ms step_avg:95.15ms
step:436/1770 train_time:41489ms step_avg:95.16ms
step:437/1770 train_time:41588ms step_avg:95.17ms
step:438/1770 train_time:41687ms step_avg:95.18ms
step:439/1770 train_time:41786ms step_avg:95.18ms
step:440/1770 train_time:41885ms step_avg:95.19ms
step:441/1770 train_time:41985ms step_avg:95.20ms
step:442/1770 train_time:42084ms step_avg:95.21ms
step:443/1770 train_time:42183ms step_avg:95.22ms
step:444/1770 train_time:42283ms step_avg:95.23ms
step:445/1770 train_time:42383ms step_avg:95.24ms
step:446/1770 train_time:42482ms step_avg:95.25ms
step:447/1770 train_time:42579ms step_avg:95.26ms
step:448/1770 train_time:42677ms step_avg:95.26ms
step:449/1770 train_time:42774ms step_avg:95.27ms
step:450/1770 train_time:42872ms step_avg:95.27ms
step:451/1770 train_time:42970ms step_avg:95.28ms
step:452/1770 train_time:43069ms step_avg:95.28ms
step:453/1770 train_time:43167ms step_avg:95.29ms
step:454/1770 train_time:43267ms step_avg:95.30ms
step:455/1770 train_time:43367ms step_avg:95.31ms
step:456/1770 train_time:43466ms step_avg:95.32ms
step:457/1770 train_time:43565ms step_avg:95.33ms
step:458/1770 train_time:43663ms step_avg:95.33ms
step:459/1770 train_time:43762ms step_avg:95.34ms
step:460/1770 train_time:43861ms step_avg:95.35ms
step:461/1770 train_time:43959ms step_avg:95.36ms
step:462/1770 train_time:44058ms step_avg:95.36ms
step:463/1770 train_time:44157ms step_avg:95.37ms
step:464/1770 train_time:44255ms step_avg:95.38ms
step:465/1770 train_time:44353ms step_avg:95.38ms
step:466/1770 train_time:44451ms step_avg:95.39ms
step:467/1770 train_time:44549ms step_avg:95.39ms
step:468/1770 train_time:44648ms step_avg:95.40ms
step:469/1770 train_time:44745ms step_avg:95.41ms
step:470/1770 train_time:44844ms step_avg:95.41ms
step:471/1770 train_time:44943ms step_avg:95.42ms
step:472/1770 train_time:45042ms step_avg:95.43ms
step:473/1770 train_time:45140ms step_avg:95.43ms
step:474/1770 train_time:45238ms step_avg:95.44ms
step:475/1770 train_time:45338ms step_avg:95.45ms
step:476/1770 train_time:45437ms step_avg:95.46ms
step:477/1770 train_time:45535ms step_avg:95.46ms
step:478/1770 train_time:45632ms step_avg:95.46ms
step:479/1770 train_time:45729ms step_avg:95.47ms
step:480/1770 train_time:45829ms step_avg:95.48ms
step:481/1770 train_time:45928ms step_avg:95.48ms
step:482/1770 train_time:46027ms step_avg:95.49ms
step:483/1770 train_time:46127ms step_avg:95.50ms
step:484/1770 train_time:46226ms step_avg:95.51ms
step:485/1770 train_time:46326ms step_avg:95.52ms
step:486/1770 train_time:46424ms step_avg:95.52ms
step:487/1770 train_time:46523ms step_avg:95.53ms
step:488/1770 train_time:46621ms step_avg:95.54ms
step:489/1770 train_time:46719ms step_avg:95.54ms
step:490/1770 train_time:46817ms step_avg:95.55ms
step:491/1770 train_time:46915ms step_avg:95.55ms
step:492/1770 train_time:47013ms step_avg:95.55ms
step:493/1770 train_time:47111ms step_avg:95.56ms
step:494/1770 train_time:47210ms step_avg:95.57ms
step:495/1770 train_time:47308ms step_avg:95.57ms
step:496/1770 train_time:47407ms step_avg:95.58ms
step:497/1770 train_time:47505ms step_avg:95.58ms
step:498/1770 train_time:47604ms step_avg:95.59ms
step:499/1770 train_time:47703ms step_avg:95.60ms
step:500/1770 train_time:47801ms step_avg:95.60ms
step:500/1770 val_loss:3.7501 train_time:48083ms step_avg:96.17ms
step:501/1770 train_time:48093ms step_avg:95.99ms
step:502/1770 train_time:48102ms step_avg:95.82ms
step:503/1770 train_time:48111ms step_avg:95.65ms
step:504/1770 train_time:48199ms step_avg:95.63ms
step:505/1770 train_time:48296ms step_avg:95.64ms
step:506/1770 train_time:48393ms step_avg:95.64ms
step:507/1770 train_time:48492ms step_avg:95.64ms
step:508/1770 train_time:48591ms step_avg:95.65ms
step:509/1770 train_time:48689ms step_avg:95.66ms
step:510/1770 train_time:48786ms step_avg:95.66ms
step:511/1770 train_time:48884ms step_avg:95.66ms
step:512/1770 train_time:48982ms step_avg:95.67ms
step:513/1770 train_time:49083ms step_avg:95.68ms
step:514/1770 train_time:49183ms step_avg:95.69ms
step:515/1770 train_time:49281ms step_avg:95.69ms
step:516/1770 train_time:49379ms step_avg:95.70ms
step:517/1770 train_time:49476ms step_avg:95.70ms
step:518/1770 train_time:49574ms step_avg:95.70ms
step:519/1770 train_time:49672ms step_avg:95.71ms
step:520/1770 train_time:49769ms step_avg:95.71ms
step:521/1770 train_time:49867ms step_avg:95.71ms
step:522/1770 train_time:49965ms step_avg:95.72ms
step:523/1770 train_time:50064ms step_avg:95.72ms
step:524/1770 train_time:50162ms step_avg:95.73ms
step:525/1770 train_time:50261ms step_avg:95.73ms
step:526/1770 train_time:50359ms step_avg:95.74ms
step:527/1770 train_time:50456ms step_avg:95.74ms
step:528/1770 train_time:50553ms step_avg:95.74ms
step:529/1770 train_time:50651ms step_avg:95.75ms
step:530/1770 train_time:50749ms step_avg:95.75ms
step:531/1770 train_time:50846ms step_avg:95.76ms
step:532/1770 train_time:50945ms step_avg:95.76ms
step:533/1770 train_time:51044ms step_avg:95.77ms
step:534/1770 train_time:51142ms step_avg:95.77ms
step:535/1770 train_time:51242ms step_avg:95.78ms
step:536/1770 train_time:51342ms step_avg:95.79ms
step:537/1770 train_time:51440ms step_avg:95.79ms
step:538/1770 train_time:51538ms step_avg:95.80ms
step:539/1770 train_time:51635ms step_avg:95.80ms
step:540/1770 train_time:51732ms step_avg:95.80ms
step:541/1770 train_time:51830ms step_avg:95.80ms
step:542/1770 train_time:51930ms step_avg:95.81ms
step:543/1770 train_time:52030ms step_avg:95.82ms
step:544/1770 train_time:52130ms step_avg:95.83ms
step:545/1770 train_time:52231ms step_avg:95.84ms
step:546/1770 train_time:52332ms step_avg:95.85ms
step:547/1770 train_time:52432ms step_avg:95.85ms
step:548/1770 train_time:52532ms step_avg:95.86ms
step:549/1770 train_time:52631ms step_avg:95.87ms
step:550/1770 train_time:52728ms step_avg:95.87ms
step:551/1770 train_time:52825ms step_avg:95.87ms
step:552/1770 train_time:52924ms step_avg:95.88ms
step:553/1770 train_time:53021ms step_avg:95.88ms
step:554/1770 train_time:53120ms step_avg:95.88ms
step:555/1770 train_time:53218ms step_avg:95.89ms
step:556/1770 train_time:53317ms step_avg:95.89ms
step:557/1770 train_time:53414ms step_avg:95.90ms
step:558/1770 train_time:53512ms step_avg:95.90ms
step:559/1770 train_time:53612ms step_avg:95.91ms
step:560/1770 train_time:53712ms step_avg:95.91ms
step:561/1770 train_time:53811ms step_avg:95.92ms
step:562/1770 train_time:53909ms step_avg:95.92ms
step:563/1770 train_time:54009ms step_avg:95.93ms
step:564/1770 train_time:54108ms step_avg:95.94ms
step:565/1770 train_time:54208ms step_avg:95.94ms
step:566/1770 train_time:54307ms step_avg:95.95ms
step:567/1770 train_time:54405ms step_avg:95.95ms
step:568/1770 train_time:54505ms step_avg:95.96ms
step:569/1770 train_time:54603ms step_avg:95.96ms
step:570/1770 train_time:54701ms step_avg:95.97ms
step:571/1770 train_time:54798ms step_avg:95.97ms
step:572/1770 train_time:54896ms step_avg:95.97ms
step:573/1770 train_time:54994ms step_avg:95.98ms
step:574/1770 train_time:55093ms step_avg:95.98ms
step:575/1770 train_time:55193ms step_avg:95.99ms
step:576/1770 train_time:55292ms step_avg:95.99ms
step:577/1770 train_time:55393ms step_avg:96.00ms
step:578/1770 train_time:55493ms step_avg:96.01ms
step:579/1770 train_time:55592ms step_avg:96.01ms
step:580/1770 train_time:55690ms step_avg:96.02ms
step:581/1770 train_time:55789ms step_avg:96.02ms
step:582/1770 train_time:55887ms step_avg:96.02ms
step:583/1770 train_time:55985ms step_avg:96.03ms
step:584/1770 train_time:56083ms step_avg:96.03ms
step:585/1770 train_time:56182ms step_avg:96.04ms
step:586/1770 train_time:56280ms step_avg:96.04ms
step:587/1770 train_time:56379ms step_avg:96.05ms
step:588/1770 train_time:56477ms step_avg:96.05ms
step:589/1770 train_time:56575ms step_avg:96.05ms
step:590/1770 train_time:56674ms step_avg:96.06ms
step:591/1770 train_time:56773ms step_avg:96.06ms
step:592/1770 train_time:56872ms step_avg:96.07ms
step:593/1770 train_time:56973ms step_avg:96.08ms
step:594/1770 train_time:57072ms step_avg:96.08ms
step:595/1770 train_time:57170ms step_avg:96.08ms
step:596/1770 train_time:57269ms step_avg:96.09ms
step:597/1770 train_time:57368ms step_avg:96.09ms
step:598/1770 train_time:57468ms step_avg:96.10ms
step:599/1770 train_time:57567ms step_avg:96.11ms
step:600/1770 train_time:57666ms step_avg:96.11ms
step:601/1770 train_time:57765ms step_avg:96.12ms
step:602/1770 train_time:57863ms step_avg:96.12ms
step:603/1770 train_time:57961ms step_avg:96.12ms
step:604/1770 train_time:58059ms step_avg:96.12ms
step:605/1770 train_time:58157ms step_avg:96.13ms
step:606/1770 train_time:58255ms step_avg:96.13ms
step:607/1770 train_time:58355ms step_avg:96.14ms
step:608/1770 train_time:58454ms step_avg:96.14ms
step:609/1770 train_time:58554ms step_avg:96.15ms
step:610/1770 train_time:58652ms step_avg:96.15ms
step:611/1770 train_time:58752ms step_avg:96.16ms
step:612/1770 train_time:58852ms step_avg:96.16ms
step:613/1770 train_time:58951ms step_avg:96.17ms
step:614/1770 train_time:59052ms step_avg:96.18ms
step:615/1770 train_time:59151ms step_avg:96.18ms
step:616/1770 train_time:59252ms step_avg:96.19ms
step:617/1770 train_time:59351ms step_avg:96.19ms
step:618/1770 train_time:59450ms step_avg:96.20ms
step:619/1770 train_time:59549ms step_avg:96.20ms
step:620/1770 train_time:59649ms step_avg:96.21ms
step:621/1770 train_time:59749ms step_avg:96.21ms
step:622/1770 train_time:59849ms step_avg:96.22ms
step:623/1770 train_time:59947ms step_avg:96.22ms
step:624/1770 train_time:60046ms step_avg:96.23ms
step:625/1770 train_time:60144ms step_avg:96.23ms
step:625/1770 val_loss:3.6622 train_time:60426ms step_avg:96.68ms
step:626/1770 train_time:60436ms step_avg:96.54ms
step:627/1770 train_time:60444ms step_avg:96.40ms
step:628/1770 train_time:60452ms step_avg:96.26ms
step:629/1770 train_time:60545ms step_avg:96.26ms
step:630/1770 train_time:60643ms step_avg:96.26ms
step:631/1770 train_time:60742ms step_avg:96.26ms
step:632/1770 train_time:60839ms step_avg:96.26ms
step:633/1770 train_time:60937ms step_avg:96.27ms
step:634/1770 train_time:61035ms step_avg:96.27ms
step:635/1770 train_time:61133ms step_avg:96.27ms
step:636/1770 train_time:61230ms step_avg:96.27ms
step:637/1770 train_time:61329ms step_avg:96.28ms
step:638/1770 train_time:61431ms step_avg:96.29ms
step:639/1770 train_time:61533ms step_avg:96.30ms
step:640/1770 train_time:61633ms step_avg:96.30ms
step:641/1770 train_time:61733ms step_avg:96.31ms
step:642/1770 train_time:61831ms step_avg:96.31ms
step:643/1770 train_time:61929ms step_avg:96.31ms
step:644/1770 train_time:62027ms step_avg:96.32ms
step:645/1770 train_time:62125ms step_avg:96.32ms
step:646/1770 train_time:62222ms step_avg:96.32ms
step:647/1770 train_time:62320ms step_avg:96.32ms
step:648/1770 train_time:62420ms step_avg:96.33ms
step:649/1770 train_time:62520ms step_avg:96.33ms
step:650/1770 train_time:62620ms step_avg:96.34ms
step:651/1770 train_time:62719ms step_avg:96.34ms
step:652/1770 train_time:62818ms step_avg:96.35ms
step:653/1770 train_time:62915ms step_avg:96.35ms
step:654/1770 train_time:63014ms step_avg:96.35ms
step:655/1770 train_time:63114ms step_avg:96.36ms
step:656/1770 train_time:63213ms step_avg:96.36ms
step:657/1770 train_time:63313ms step_avg:96.37ms
step:658/1770 train_time:63411ms step_avg:96.37ms
step:659/1770 train_time:63511ms step_avg:96.38ms
step:660/1770 train_time:63613ms step_avg:96.38ms
step:661/1770 train_time:63715ms step_avg:96.39ms
step:662/1770 train_time:63816ms step_avg:96.40ms
step:663/1770 train_time:63916ms step_avg:96.40ms
step:664/1770 train_time:64016ms step_avg:96.41ms
step:665/1770 train_time:64117ms step_avg:96.42ms
step:666/1770 train_time:64217ms step_avg:96.42ms
step:667/1770 train_time:64318ms step_avg:96.43ms
step:668/1770 train_time:64418ms step_avg:96.43ms
step:669/1770 train_time:64519ms step_avg:96.44ms
step:670/1770 train_time:64620ms step_avg:96.45ms
step:671/1770 train_time:64720ms step_avg:96.45ms
step:672/1770 train_time:64821ms step_avg:96.46ms
step:673/1770 train_time:64921ms step_avg:96.46ms
step:674/1770 train_time:65020ms step_avg:96.47ms
step:675/1770 train_time:65120ms step_avg:96.47ms
step:676/1770 train_time:65220ms step_avg:96.48ms
step:677/1770 train_time:65320ms step_avg:96.48ms
step:678/1770 train_time:65421ms step_avg:96.49ms
step:679/1770 train_time:65521ms step_avg:96.50ms
step:680/1770 train_time:65621ms step_avg:96.50ms
step:681/1770 train_time:65721ms step_avg:96.51ms
step:682/1770 train_time:65821ms step_avg:96.51ms
step:683/1770 train_time:65920ms step_avg:96.52ms
step:684/1770 train_time:66020ms step_avg:96.52ms
step:685/1770 train_time:66121ms step_avg:96.53ms
step:686/1770 train_time:66220ms step_avg:96.53ms
step:687/1770 train_time:66320ms step_avg:96.54ms
step:688/1770 train_time:66420ms step_avg:96.54ms
step:689/1770 train_time:66521ms step_avg:96.55ms
step:690/1770 train_time:66621ms step_avg:96.55ms
step:691/1770 train_time:66721ms step_avg:96.56ms
step:692/1770 train_time:66821ms step_avg:96.56ms
step:693/1770 train_time:66920ms step_avg:96.57ms
step:694/1770 train_time:67020ms step_avg:96.57ms
step:695/1770 train_time:67120ms step_avg:96.58ms
step:696/1770 train_time:67222ms step_avg:96.58ms
step:697/1770 train_time:67321ms step_avg:96.59ms
step:698/1770 train_time:67421ms step_avg:96.59ms
step:699/1770 train_time:67521ms step_avg:96.60ms
step:700/1770 train_time:67621ms step_avg:96.60ms
step:701/1770 train_time:67721ms step_avg:96.61ms
step:702/1770 train_time:67820ms step_avg:96.61ms
step:703/1770 train_time:67921ms step_avg:96.62ms
step:704/1770 train_time:68020ms step_avg:96.62ms
step:705/1770 train_time:68121ms step_avg:96.63ms
step:706/1770 train_time:68221ms step_avg:96.63ms
step:707/1770 train_time:68320ms step_avg:96.63ms
step:708/1770 train_time:68420ms step_avg:96.64ms
step:709/1770 train_time:68520ms step_avg:96.64ms
step:710/1770 train_time:68621ms step_avg:96.65ms
step:711/1770 train_time:68721ms step_avg:96.65ms
step:712/1770 train_time:68821ms step_avg:96.66ms
step:713/1770 train_time:68922ms step_avg:96.66ms
step:714/1770 train_time:69021ms step_avg:96.67ms
step:715/1770 train_time:69121ms step_avg:96.67ms
step:716/1770 train_time:69221ms step_avg:96.68ms
step:717/1770 train_time:69320ms step_avg:96.68ms
step:718/1770 train_time:69421ms step_avg:96.69ms
step:719/1770 train_time:69520ms step_avg:96.69ms
step:720/1770 train_time:69620ms step_avg:96.69ms
step:721/1770 train_time:69720ms step_avg:96.70ms
step:722/1770 train_time:69820ms step_avg:96.70ms
step:723/1770 train_time:69920ms step_avg:96.71ms
step:724/1770 train_time:70021ms step_avg:96.71ms
step:725/1770 train_time:70121ms step_avg:96.72ms
step:726/1770 train_time:70221ms step_avg:96.72ms
step:727/1770 train_time:70321ms step_avg:96.73ms
step:728/1770 train_time:70420ms step_avg:96.73ms
step:729/1770 train_time:70520ms step_avg:96.74ms
step:730/1770 train_time:70621ms step_avg:96.74ms
step:731/1770 train_time:70721ms step_avg:96.74ms
step:732/1770 train_time:70821ms step_avg:96.75ms
step:733/1770 train_time:70921ms step_avg:96.75ms
step:734/1770 train_time:71021ms step_avg:96.76ms
step:735/1770 train_time:71121ms step_avg:96.76ms
step:736/1770 train_time:71221ms step_avg:96.77ms
step:737/1770 train_time:71321ms step_avg:96.77ms
step:738/1770 train_time:71420ms step_avg:96.78ms
step:739/1770 train_time:71520ms step_avg:96.78ms
step:740/1770 train_time:71620ms step_avg:96.78ms
step:741/1770 train_time:71720ms step_avg:96.79ms
step:742/1770 train_time:71820ms step_avg:96.79ms
step:743/1770 train_time:71919ms step_avg:96.80ms
step:744/1770 train_time:72020ms step_avg:96.80ms
step:745/1770 train_time:72120ms step_avg:96.81ms
step:746/1770 train_time:72221ms step_avg:96.81ms
step:747/1770 train_time:72320ms step_avg:96.81ms
step:748/1770 train_time:72420ms step_avg:96.82ms
step:749/1770 train_time:72520ms step_avg:96.82ms
step:750/1770 train_time:72620ms step_avg:96.83ms
step:750/1770 val_loss:3.5996 train_time:72907ms step_avg:97.21ms
step:751/1770 train_time:72916ms step_avg:97.09ms
step:752/1770 train_time:72925ms step_avg:96.97ms
step:753/1770 train_time:72934ms step_avg:96.86ms
step:754/1770 train_time:73027ms step_avg:96.85ms
step:755/1770 train_time:73126ms step_avg:96.86ms
step:756/1770 train_time:73225ms step_avg:96.86ms
step:757/1770 train_time:73325ms step_avg:96.86ms
step:758/1770 train_time:73425ms step_avg:96.87ms
step:759/1770 train_time:73525ms step_avg:96.87ms
step:760/1770 train_time:73625ms step_avg:96.87ms
step:761/1770 train_time:73724ms step_avg:96.88ms
step:762/1770 train_time:73825ms step_avg:96.88ms
step:763/1770 train_time:73928ms step_avg:96.89ms
step:764/1770 train_time:74030ms step_avg:96.90ms
step:765/1770 train_time:74130ms step_avg:96.90ms
step:766/1770 train_time:74230ms step_avg:96.91ms
step:767/1770 train_time:74330ms step_avg:96.91ms
step:768/1770 train_time:74429ms step_avg:96.91ms
step:769/1770 train_time:74528ms step_avg:96.92ms
step:770/1770 train_time:74627ms step_avg:96.92ms
step:771/1770 train_time:74726ms step_avg:96.92ms
step:772/1770 train_time:74826ms step_avg:96.92ms
step:773/1770 train_time:74928ms step_avg:96.93ms
step:774/1770 train_time:75029ms step_avg:96.94ms
step:775/1770 train_time:75129ms step_avg:96.94ms
step:776/1770 train_time:75228ms step_avg:96.94ms
step:777/1770 train_time:75328ms step_avg:96.95ms
step:778/1770 train_time:75428ms step_avg:96.95ms
step:779/1770 train_time:75527ms step_avg:96.95ms
step:780/1770 train_time:75627ms step_avg:96.96ms
step:781/1770 train_time:75726ms step_avg:96.96ms
step:782/1770 train_time:75826ms step_avg:96.96ms
step:783/1770 train_time:75927ms step_avg:96.97ms
step:784/1770 train_time:76027ms step_avg:96.97ms
step:785/1770 train_time:76128ms step_avg:96.98ms
step:786/1770 train_time:76228ms step_avg:96.98ms
step:787/1770 train_time:76328ms step_avg:96.99ms
step:788/1770 train_time:76428ms step_avg:96.99ms
step:789/1770 train_time:76527ms step_avg:96.99ms
step:790/1770 train_time:76627ms step_avg:97.00ms
step:791/1770 train_time:76727ms step_avg:97.00ms
step:792/1770 train_time:76827ms step_avg:97.00ms
step:793/1770 train_time:76928ms step_avg:97.01ms
step:794/1770 train_time:77029ms step_avg:97.01ms
step:795/1770 train_time:77129ms step_avg:97.02ms
step:796/1770 train_time:77230ms step_avg:97.02ms
step:797/1770 train_time:77330ms step_avg:97.03ms
step:798/1770 train_time:77429ms step_avg:97.03ms
step:799/1770 train_time:77529ms step_avg:97.03ms
step:800/1770 train_time:77628ms step_avg:97.04ms
step:801/1770 train_time:77728ms step_avg:97.04ms
step:802/1770 train_time:77828ms step_avg:97.04ms
step:803/1770 train_time:77929ms step_avg:97.05ms
step:804/1770 train_time:78030ms step_avg:97.05ms
step:805/1770 train_time:78131ms step_avg:97.06ms
step:806/1770 train_time:78230ms step_avg:97.06ms
step:807/1770 train_time:78331ms step_avg:97.06ms
step:808/1770 train_time:78430ms step_avg:97.07ms
step:809/1770 train_time:78530ms step_avg:97.07ms
step:810/1770 train_time:78630ms step_avg:97.07ms
step:811/1770 train_time:78729ms step_avg:97.08ms
step:812/1770 train_time:78830ms step_avg:97.08ms
step:813/1770 train_time:78929ms step_avg:97.08ms
step:814/1770 train_time:79029ms step_avg:97.09ms
step:815/1770 train_time:79129ms step_avg:97.09ms
step:816/1770 train_time:79228ms step_avg:97.09ms
step:817/1770 train_time:79329ms step_avg:97.10ms
step:818/1770 train_time:79429ms step_avg:97.10ms
step:819/1770 train_time:79529ms step_avg:97.11ms
step:820/1770 train_time:79630ms step_avg:97.11ms
step:821/1770 train_time:79730ms step_avg:97.11ms
step:822/1770 train_time:79829ms step_avg:97.12ms
step:823/1770 train_time:79928ms step_avg:97.12ms
step:824/1770 train_time:80027ms step_avg:97.12ms
step:825/1770 train_time:80127ms step_avg:97.12ms
step:826/1770 train_time:80226ms step_avg:97.13ms
step:827/1770 train_time:80327ms step_avg:97.13ms
step:828/1770 train_time:80428ms step_avg:97.13ms
step:829/1770 train_time:80528ms step_avg:97.14ms
step:830/1770 train_time:80628ms step_avg:97.14ms
step:831/1770 train_time:80729ms step_avg:97.15ms
step:832/1770 train_time:80829ms step_avg:97.15ms
step:833/1770 train_time:80928ms step_avg:97.15ms
step:834/1770 train_time:81028ms step_avg:97.16ms
step:835/1770 train_time:81129ms step_avg:97.16ms
step:836/1770 train_time:81229ms step_avg:97.16ms
step:837/1770 train_time:81328ms step_avg:97.17ms
step:838/1770 train_time:81429ms step_avg:97.17ms
step:839/1770 train_time:81530ms step_avg:97.17ms
step:840/1770 train_time:81629ms step_avg:97.18ms
step:841/1770 train_time:81730ms step_avg:97.18ms
step:842/1770 train_time:81829ms step_avg:97.18ms
step:843/1770 train_time:81930ms step_avg:97.19ms
step:844/1770 train_time:82028ms step_avg:97.19ms
step:845/1770 train_time:82128ms step_avg:97.19ms
step:846/1770 train_time:82227ms step_avg:97.20ms
step:847/1770 train_time:82327ms step_avg:97.20ms
step:848/1770 train_time:82428ms step_avg:97.20ms
step:849/1770 train_time:82528ms step_avg:97.21ms
step:850/1770 train_time:82629ms step_avg:97.21ms
step:851/1770 train_time:82729ms step_avg:97.21ms
step:852/1770 train_time:82829ms step_avg:97.22ms
step:853/1770 train_time:82929ms step_avg:97.22ms
step:854/1770 train_time:83029ms step_avg:97.22ms
step:855/1770 train_time:83128ms step_avg:97.23ms
step:856/1770 train_time:83228ms step_avg:97.23ms
step:857/1770 train_time:83328ms step_avg:97.23ms
step:858/1770 train_time:83428ms step_avg:97.24ms
step:859/1770 train_time:83528ms step_avg:97.24ms
step:860/1770 train_time:83628ms step_avg:97.24ms
step:861/1770 train_time:83728ms step_avg:97.25ms
step:862/1770 train_time:83828ms step_avg:97.25ms
step:863/1770 train_time:83928ms step_avg:97.25ms
step:864/1770 train_time:84028ms step_avg:97.25ms
step:865/1770 train_time:84129ms step_avg:97.26ms
step:866/1770 train_time:84228ms step_avg:97.26ms
step:867/1770 train_time:84328ms step_avg:97.26ms
step:868/1770 train_time:84428ms step_avg:97.27ms
step:869/1770 train_time:84529ms step_avg:97.27ms
step:870/1770 train_time:84629ms step_avg:97.27ms
step:871/1770 train_time:84729ms step_avg:97.28ms
step:872/1770 train_time:84828ms step_avg:97.28ms
step:873/1770 train_time:84929ms step_avg:97.28ms
step:874/1770 train_time:85028ms step_avg:97.29ms
step:875/1770 train_time:85128ms step_avg:97.29ms
step:875/1770 val_loss:3.5489 train_time:85415ms step_avg:97.62ms
step:876/1770 train_time:85425ms step_avg:97.52ms
step:877/1770 train_time:85433ms step_avg:97.41ms
step:878/1770 train_time:85441ms step_avg:97.31ms
step:879/1770 train_time:85532ms step_avg:97.31ms
step:880/1770 train_time:85633ms step_avg:97.31ms
step:881/1770 train_time:85732ms step_avg:97.31ms
step:882/1770 train_time:85831ms step_avg:97.31ms
step:883/1770 train_time:85930ms step_avg:97.32ms
step:884/1770 train_time:86029ms step_avg:97.32ms
step:885/1770 train_time:86129ms step_avg:97.32ms
step:886/1770 train_time:86229ms step_avg:97.32ms
step:887/1770 train_time:86331ms step_avg:97.33ms
step:888/1770 train_time:86435ms step_avg:97.34ms
step:889/1770 train_time:86536ms step_avg:97.34ms
step:890/1770 train_time:86636ms step_avg:97.34ms
step:891/1770 train_time:86736ms step_avg:97.35ms
step:892/1770 train_time:86836ms step_avg:97.35ms
step:893/1770 train_time:86935ms step_avg:97.35ms
step:894/1770 train_time:87034ms step_avg:97.35ms
step:895/1770 train_time:87133ms step_avg:97.36ms
step:896/1770 train_time:87232ms step_avg:97.36ms
step:897/1770 train_time:87333ms step_avg:97.36ms
step:898/1770 train_time:87434ms step_avg:97.36ms
step:899/1770 train_time:87535ms step_avg:97.37ms
step:900/1770 train_time:87636ms step_avg:97.37ms
step:901/1770 train_time:87736ms step_avg:97.38ms
step:902/1770 train_time:87837ms step_avg:97.38ms
step:903/1770 train_time:87936ms step_avg:97.38ms
step:904/1770 train_time:88035ms step_avg:97.38ms
step:905/1770 train_time:88135ms step_avg:97.39ms
step:906/1770 train_time:88236ms step_avg:97.39ms
step:907/1770 train_time:88337ms step_avg:97.39ms
step:908/1770 train_time:88437ms step_avg:97.40ms
step:909/1770 train_time:88538ms step_avg:97.40ms
step:910/1770 train_time:88638ms step_avg:97.40ms
step:911/1770 train_time:88738ms step_avg:97.41ms
step:912/1770 train_time:88838ms step_avg:97.41ms
step:913/1770 train_time:88938ms step_avg:97.41ms
step:914/1770 train_time:89039ms step_avg:97.42ms
step:915/1770 train_time:89138ms step_avg:97.42ms
step:916/1770 train_time:89240ms step_avg:97.42ms
step:917/1770 train_time:89341ms step_avg:97.43ms
step:918/1770 train_time:89441ms step_avg:97.43ms
step:919/1770 train_time:89542ms step_avg:97.43ms
step:920/1770 train_time:89643ms step_avg:97.44ms
step:921/1770 train_time:89744ms step_avg:97.44ms
step:922/1770 train_time:89846ms step_avg:97.45ms
step:923/1770 train_time:89949ms step_avg:97.45ms
step:924/1770 train_time:90051ms step_avg:97.46ms
step:925/1770 train_time:90153ms step_avg:97.46ms
step:926/1770 train_time:90253ms step_avg:97.47ms
step:927/1770 train_time:90354ms step_avg:97.47ms
step:928/1770 train_time:90454ms step_avg:97.47ms
step:929/1770 train_time:90555ms step_avg:97.48ms
step:930/1770 train_time:90656ms step_avg:97.48ms
step:931/1770 train_time:90757ms step_avg:97.48ms
step:932/1770 train_time:90860ms step_avg:97.49ms
step:933/1770 train_time:90961ms step_avg:97.49ms
step:934/1770 train_time:91063ms step_avg:97.50ms
step:935/1770 train_time:91164ms step_avg:97.50ms
step:936/1770 train_time:91268ms step_avg:97.51ms
step:937/1770 train_time:91371ms step_avg:97.51ms
step:938/1770 train_time:91473ms step_avg:97.52ms
step:939/1770 train_time:91574ms step_avg:97.52ms
step:940/1770 train_time:91675ms step_avg:97.53ms
step:941/1770 train_time:91776ms step_avg:97.53ms
step:942/1770 train_time:91877ms step_avg:97.53ms
step:943/1770 train_time:91978ms step_avg:97.54ms
step:944/1770 train_time:92080ms step_avg:97.54ms
step:945/1770 train_time:92182ms step_avg:97.55ms
step:946/1770 train_time:92285ms step_avg:97.55ms
step:947/1770 train_time:92387ms step_avg:97.56ms
step:948/1770 train_time:92491ms step_avg:97.56ms
step:949/1770 train_time:92593ms step_avg:97.57ms
step:950/1770 train_time:92694ms step_avg:97.57ms
step:951/1770 train_time:92794ms step_avg:97.58ms
step:952/1770 train_time:92895ms step_avg:97.58ms
step:953/1770 train_time:92995ms step_avg:97.58ms
step:954/1770 train_time:93097ms step_avg:97.59ms
step:955/1770 train_time:93199ms step_avg:97.59ms
step:956/1770 train_time:93301ms step_avg:97.60ms
step:957/1770 train_time:93405ms step_avg:97.60ms
step:958/1770 train_time:93508ms step_avg:97.61ms
step:959/1770 train_time:93611ms step_avg:97.61ms
step:960/1770 train_time:93712ms step_avg:97.62ms
step:961/1770 train_time:93814ms step_avg:97.62ms
step:962/1770 train_time:93915ms step_avg:97.62ms
step:963/1770 train_time:94015ms step_avg:97.63ms
step:964/1770 train_time:94117ms step_avg:97.63ms
step:965/1770 train_time:94218ms step_avg:97.63ms
step:966/1770 train_time:94321ms step_avg:97.64ms
step:967/1770 train_time:94425ms step_avg:97.65ms
step:968/1770 train_time:94527ms step_avg:97.65ms
step:969/1770 train_time:94631ms step_avg:97.66ms
step:970/1770 train_time:94732ms step_avg:97.66ms
step:971/1770 train_time:94833ms step_avg:97.67ms
step:972/1770 train_time:94934ms step_avg:97.67ms
step:973/1770 train_time:95035ms step_avg:97.67ms
step:974/1770 train_time:95135ms step_avg:97.67ms
step:975/1770 train_time:95237ms step_avg:97.68ms
step:976/1770 train_time:95338ms step_avg:97.68ms
step:977/1770 train_time:95440ms step_avg:97.69ms
step:978/1770 train_time:95544ms step_avg:97.69ms
step:979/1770 train_time:95646ms step_avg:97.70ms
step:980/1770 train_time:95748ms step_avg:97.70ms
step:981/1770 train_time:95851ms step_avg:97.71ms
step:982/1770 train_time:95952ms step_avg:97.71ms
step:983/1770 train_time:96054ms step_avg:97.72ms
step:984/1770 train_time:96154ms step_avg:97.72ms
step:985/1770 train_time:96255ms step_avg:97.72ms
step:986/1770 train_time:96357ms step_avg:97.72ms
step:987/1770 train_time:96458ms step_avg:97.73ms
step:988/1770 train_time:96560ms step_avg:97.73ms
step:989/1770 train_time:96662ms step_avg:97.74ms
step:990/1770 train_time:96764ms step_avg:97.74ms
step:991/1770 train_time:96868ms step_avg:97.75ms
step:992/1770 train_time:96971ms step_avg:97.75ms
step:993/1770 train_time:97073ms step_avg:97.76ms
step:994/1770 train_time:97173ms step_avg:97.76ms
step:995/1770 train_time:97274ms step_avg:97.76ms
step:996/1770 train_time:97374ms step_avg:97.77ms
step:997/1770 train_time:97476ms step_avg:97.77ms
step:998/1770 train_time:97577ms step_avg:97.77ms
step:999/1770 train_time:97679ms step_avg:97.78ms
step:1000/1770 train_time:97781ms step_avg:97.78ms
step:1000/1770 val_loss:3.5120 train_time:98079ms step_avg:98.08ms
step:1001/1770 train_time:98088ms step_avg:97.99ms
step:1002/1770 train_time:98097ms step_avg:97.90ms
step:1003/1770 train_time:98105ms step_avg:97.81ms
step:1004/1770 train_time:98198ms step_avg:97.81ms
step:1005/1770 train_time:98301ms step_avg:97.81ms
step:1006/1770 train_time:98405ms step_avg:97.82ms
step:1007/1770 train_time:98507ms step_avg:97.82ms
step:1008/1770 train_time:98608ms step_avg:97.83ms
step:1009/1770 train_time:98709ms step_avg:97.83ms
step:1010/1770 train_time:98810ms step_avg:97.83ms
step:1011/1770 train_time:98910ms step_avg:97.83ms
step:1012/1770 train_time:99011ms step_avg:97.84ms
step:1013/1770 train_time:99114ms step_avg:97.84ms
step:1014/1770 train_time:99216ms step_avg:97.85ms
step:1015/1770 train_time:99318ms step_avg:97.85ms
step:1016/1770 train_time:99420ms step_avg:97.85ms
step:1017/1770 train_time:99522ms step_avg:97.86ms
step:1018/1770 train_time:99626ms step_avg:97.86ms
step:1019/1770 train_time:99727ms step_avg:97.87ms
step:1020/1770 train_time:99828ms step_avg:97.87ms
step:1021/1770 train_time:99929ms step_avg:97.87ms
step:1022/1770 train_time:100030ms step_avg:97.88ms
step:1023/1770 train_time:100131ms step_avg:97.88ms
step:1024/1770 train_time:100233ms step_avg:97.88ms
step:1025/1770 train_time:100334ms step_avg:97.89ms
step:1026/1770 train_time:100435ms step_avg:97.89ms
step:1027/1770 train_time:100538ms step_avg:97.89ms
step:1028/1770 train_time:100640ms step_avg:97.90ms
step:1029/1770 train_time:100742ms step_avg:97.90ms
step:1030/1770 train_time:100844ms step_avg:97.91ms
step:1031/1770 train_time:100948ms step_avg:97.91ms
step:1032/1770 train_time:101049ms step_avg:97.92ms
step:1033/1770 train_time:101150ms step_avg:97.92ms
step:1034/1770 train_time:101251ms step_avg:97.92ms
step:1035/1770 train_time:101352ms step_avg:97.93ms
step:1036/1770 train_time:101453ms step_avg:97.93ms
step:1037/1770 train_time:101555ms step_avg:97.93ms
step:1038/1770 train_time:101656ms step_avg:97.93ms
step:1039/1770 train_time:101758ms step_avg:97.94ms
step:1040/1770 train_time:101862ms step_avg:97.94ms
step:1041/1770 train_time:101965ms step_avg:97.95ms
step:1042/1770 train_time:102067ms step_avg:97.95ms
step:1043/1770 train_time:102169ms step_avg:97.96ms
step:1044/1770 train_time:102270ms step_avg:97.96ms
step:1045/1770 train_time:102371ms step_avg:97.96ms
step:1046/1770 train_time:102472ms step_avg:97.97ms
step:1047/1770 train_time:102572ms step_avg:97.97ms
step:1048/1770 train_time:102673ms step_avg:97.97ms
step:1049/1770 train_time:102775ms step_avg:97.97ms
step:1050/1770 train_time:102877ms step_avg:97.98ms
step:1051/1770 train_time:102979ms step_avg:97.98ms
step:1052/1770 train_time:103082ms step_avg:97.99ms
step:1053/1770 train_time:103186ms step_avg:97.99ms
step:1054/1770 train_time:103288ms step_avg:98.00ms
step:1055/1770 train_time:103390ms step_avg:98.00ms
step:1056/1770 train_time:103490ms step_avg:98.00ms
step:1057/1770 train_time:103591ms step_avg:98.00ms
step:1058/1770 train_time:103691ms step_avg:98.01ms
step:1059/1770 train_time:103792ms step_avg:98.01ms
step:1060/1770 train_time:103894ms step_avg:98.01ms
step:1061/1770 train_time:103996ms step_avg:98.02ms
step:1062/1770 train_time:104098ms step_avg:98.02ms
step:1063/1770 train_time:104202ms step_avg:98.03ms
step:1064/1770 train_time:104306ms step_avg:98.03ms
step:1065/1770 train_time:104409ms step_avg:98.04ms
step:1066/1770 train_time:104510ms step_avg:98.04ms
step:1067/1770 train_time:104612ms step_avg:98.04ms
step:1068/1770 train_time:104712ms step_avg:98.04ms
step:1069/1770 train_time:104812ms step_avg:98.05ms
step:1070/1770 train_time:104914ms step_avg:98.05ms
step:1071/1770 train_time:105015ms step_avg:98.05ms
step:1072/1770 train_time:105117ms step_avg:98.06ms
step:1073/1770 train_time:105219ms step_avg:98.06ms
step:1074/1770 train_time:105323ms step_avg:98.07ms
step:1075/1770 train_time:105426ms step_avg:98.07ms
step:1076/1770 train_time:105528ms step_avg:98.07ms
step:1077/1770 train_time:105629ms step_avg:98.08ms
step:1078/1770 train_time:105731ms step_avg:98.08ms
step:1079/1770 train_time:105832ms step_avg:98.08ms
step:1080/1770 train_time:105932ms step_avg:98.09ms
step:1081/1770 train_time:106033ms step_avg:98.09ms
step:1082/1770 train_time:106136ms step_avg:98.09ms
step:1083/1770 train_time:106238ms step_avg:98.10ms
step:1084/1770 train_time:106340ms step_avg:98.10ms
step:1085/1770 train_time:106444ms step_avg:98.10ms
step:1086/1770 train_time:106546ms step_avg:98.11ms
step:1087/1770 train_time:106649ms step_avg:98.11ms
step:1088/1770 train_time:106751ms step_avg:98.12ms
step:1089/1770 train_time:106852ms step_avg:98.12ms
step:1090/1770 train_time:106952ms step_avg:98.12ms
step:1091/1770 train_time:107053ms step_avg:98.12ms
step:1092/1770 train_time:107155ms step_avg:98.13ms
step:1093/1770 train_time:107257ms step_avg:98.13ms
step:1094/1770 train_time:107360ms step_avg:98.14ms
step:1095/1770 train_time:107462ms step_avg:98.14ms
step:1096/1770 train_time:107565ms step_avg:98.14ms
step:1097/1770 train_time:107669ms step_avg:98.15ms
step:1098/1770 train_time:107770ms step_avg:98.15ms
step:1099/1770 train_time:107873ms step_avg:98.16ms
step:1100/1770 train_time:107975ms step_avg:98.16ms
step:1101/1770 train_time:108075ms step_avg:98.16ms
step:1102/1770 train_time:108176ms step_avg:98.16ms
step:1103/1770 train_time:108278ms step_avg:98.17ms
step:1104/1770 train_time:108381ms step_avg:98.17ms
step:1105/1770 train_time:108484ms step_avg:98.18ms
step:1106/1770 train_time:108589ms step_avg:98.18ms
step:1107/1770 train_time:108690ms step_avg:98.18ms
step:1108/1770 train_time:108792ms step_avg:98.19ms
step:1109/1770 train_time:108894ms step_avg:98.19ms
step:1110/1770 train_time:108995ms step_avg:98.19ms
step:1111/1770 train_time:109096ms step_avg:98.20ms
step:1112/1770 train_time:109198ms step_avg:98.20ms
step:1113/1770 train_time:109299ms step_avg:98.20ms
step:1114/1770 train_time:109402ms step_avg:98.21ms
step:1115/1770 train_time:109505ms step_avg:98.21ms
step:1116/1770 train_time:109608ms step_avg:98.22ms
step:1117/1770 train_time:109711ms step_avg:98.22ms
step:1118/1770 train_time:109811ms step_avg:98.22ms
step:1119/1770 train_time:109914ms step_avg:98.23ms
step:1120/1770 train_time:110015ms step_avg:98.23ms
step:1121/1770 train_time:110116ms step_avg:98.23ms
step:1122/1770 train_time:110218ms step_avg:98.23ms
step:1123/1770 train_time:110319ms step_avg:98.24ms
step:1124/1770 train_time:110423ms step_avg:98.24ms
step:1125/1770 train_time:110526ms step_avg:98.24ms
step:1125/1770 val_loss:3.4715 train_time:110820ms step_avg:98.51ms
step:1126/1770 train_time:110829ms step_avg:98.43ms
step:1127/1770 train_time:110838ms step_avg:98.35ms
step:1128/1770 train_time:110846ms step_avg:98.27ms
step:1129/1770 train_time:110937ms step_avg:98.26ms
step:1130/1770 train_time:111039ms step_avg:98.26ms
step:1131/1770 train_time:111139ms step_avg:98.27ms
step:1132/1770 train_time:111240ms step_avg:98.27ms
step:1133/1770 train_time:111340ms step_avg:98.27ms
step:1134/1770 train_time:111440ms step_avg:98.27ms
step:1135/1770 train_time:111540ms step_avg:98.27ms
step:1136/1770 train_time:111641ms step_avg:98.28ms
step:1137/1770 train_time:111745ms step_avg:98.28ms
step:1138/1770 train_time:111850ms step_avg:98.29ms
step:1139/1770 train_time:111955ms step_avg:98.29ms
step:1140/1770 train_time:112056ms step_avg:98.29ms
step:1141/1770 train_time:112160ms step_avg:98.30ms
step:1142/1770 train_time:112261ms step_avg:98.30ms
step:1143/1770 train_time:112361ms step_avg:98.30ms
step:1144/1770 train_time:112462ms step_avg:98.31ms
step:1145/1770 train_time:112562ms step_avg:98.31ms
step:1146/1770 train_time:112663ms step_avg:98.31ms
step:1147/1770 train_time:112764ms step_avg:98.31ms
step:1148/1770 train_time:112869ms step_avg:98.32ms
step:1149/1770 train_time:112973ms step_avg:98.32ms
step:1150/1770 train_time:113077ms step_avg:98.33ms
step:1151/1770 train_time:113179ms step_avg:98.33ms
step:1152/1770 train_time:113281ms step_avg:98.33ms
step:1153/1770 train_time:113382ms step_avg:98.34ms
step:1154/1770 train_time:113483ms step_avg:98.34ms
step:1155/1770 train_time:113584ms step_avg:98.34ms
step:1156/1770 train_time:113686ms step_avg:98.34ms
step:1157/1770 train_time:113789ms step_avg:98.35ms
step:1158/1770 train_time:113891ms step_avg:98.35ms
step:1159/1770 train_time:113994ms step_avg:98.36ms
step:1160/1770 train_time:114097ms step_avg:98.36ms
step:1161/1770 train_time:114199ms step_avg:98.36ms
step:1162/1770 train_time:114300ms step_avg:98.36ms
step:1163/1770 train_time:114400ms step_avg:98.37ms
step:1164/1770 train_time:114502ms step_avg:98.37ms
step:1165/1770 train_time:114603ms step_avg:98.37ms
step:1166/1770 train_time:114703ms step_avg:98.37ms
step:1167/1770 train_time:114805ms step_avg:98.38ms
step:1168/1770 train_time:114907ms step_avg:98.38ms
step:1169/1770 train_time:115010ms step_avg:98.38ms
step:1170/1770 train_time:115114ms step_avg:98.39ms
step:1171/1770 train_time:115218ms step_avg:98.39ms
step:1172/1770 train_time:115320ms step_avg:98.40ms
step:1173/1770 train_time:115420ms step_avg:98.40ms
step:1174/1770 train_time:115521ms step_avg:98.40ms
step:1175/1770 train_time:115622ms step_avg:98.40ms
step:1176/1770 train_time:115723ms step_avg:98.40ms
step:1177/1770 train_time:115825ms step_avg:98.41ms
step:1178/1770 train_time:115926ms step_avg:98.41ms
step:1179/1770 train_time:116029ms step_avg:98.41ms
step:1180/1770 train_time:116131ms step_avg:98.42ms
step:1181/1770 train_time:116234ms step_avg:98.42ms
step:1182/1770 train_time:116337ms step_avg:98.42ms
step:1183/1770 train_time:116439ms step_avg:98.43ms
step:1184/1770 train_time:116540ms step_avg:98.43ms
step:1185/1770 train_time:116642ms step_avg:98.43ms
step:1186/1770 train_time:116744ms step_avg:98.43ms
step:1187/1770 train_time:116846ms step_avg:98.44ms
step:1188/1770 train_time:116950ms step_avg:98.44ms
step:1189/1770 train_time:117054ms step_avg:98.45ms
step:1190/1770 train_time:117158ms step_avg:98.45ms
step:1191/1770 train_time:117261ms step_avg:98.46ms
step:1192/1770 train_time:117365ms step_avg:98.46ms
step:1193/1770 train_time:117469ms step_avg:98.46ms
step:1194/1770 train_time:117572ms step_avg:98.47ms
step:1195/1770 train_time:117675ms step_avg:98.47ms
step:1196/1770 train_time:117778ms step_avg:98.48ms
step:1197/1770 train_time:117881ms step_avg:98.48ms
step:1198/1770 train_time:117983ms step_avg:98.48ms
step:1199/1770 train_time:118087ms step_avg:98.49ms
step:1200/1770 train_time:118189ms step_avg:98.49ms
step:1201/1770 train_time:118293ms step_avg:98.50ms
step:1202/1770 train_time:118397ms step_avg:98.50ms
step:1203/1770 train_time:118501ms step_avg:98.50ms
step:1204/1770 train_time:118603ms step_avg:98.51ms
step:1205/1770 train_time:118705ms step_avg:98.51ms
step:1206/1770 train_time:118808ms step_avg:98.51ms
step:1207/1770 train_time:118910ms step_avg:98.52ms
step:1208/1770 train_time:119017ms step_avg:98.52ms
step:1209/1770 train_time:119121ms step_avg:98.53ms
step:1210/1770 train_time:119223ms step_avg:98.53ms
step:1211/1770 train_time:119326ms step_avg:98.54ms
step:1212/1770 train_time:119429ms step_avg:98.54ms
step:1213/1770 train_time:119533ms step_avg:98.54ms
step:1214/1770 train_time:119638ms step_avg:98.55ms
step:1215/1770 train_time:119740ms step_avg:98.55ms
step:1216/1770 train_time:119843ms step_avg:98.56ms
step:1217/1770 train_time:119947ms step_avg:98.56ms
step:1218/1770 train_time:120053ms step_avg:98.57ms
step:1219/1770 train_time:120156ms step_avg:98.57ms
step:1220/1770 train_time:120258ms step_avg:98.57ms
step:1221/1770 train_time:120361ms step_avg:98.58ms
step:1222/1770 train_time:120463ms step_avg:98.58ms
step:1223/1770 train_time:120565ms step_avg:98.58ms
step:1224/1770 train_time:120669ms step_avg:98.59ms
step:1225/1770 train_time:120772ms step_avg:98.59ms
step:1226/1770 train_time:120877ms step_avg:98.59ms
step:1227/1770 train_time:120981ms step_avg:98.60ms
step:1228/1770 train_time:121083ms step_avg:98.60ms
step:1229/1770 train_time:121186ms step_avg:98.61ms
step:1230/1770 train_time:121290ms step_avg:98.61ms
step:1231/1770 train_time:121393ms step_avg:98.61ms
step:1232/1770 train_time:121497ms step_avg:98.62ms
step:1233/1770 train_time:121599ms step_avg:98.62ms
step:1234/1770 train_time:121701ms step_avg:98.62ms
step:1235/1770 train_time:121803ms step_avg:98.63ms
step:1236/1770 train_time:121906ms step_avg:98.63ms
step:1237/1770 train_time:122010ms step_avg:98.63ms
step:1238/1770 train_time:122115ms step_avg:98.64ms
step:1239/1770 train_time:122218ms step_avg:98.64ms
step:1240/1770 train_time:122319ms step_avg:98.64ms
step:1241/1770 train_time:122421ms step_avg:98.65ms
step:1242/1770 train_time:122523ms step_avg:98.65ms
step:1243/1770 train_time:122627ms step_avg:98.65ms
step:1244/1770 train_time:122731ms step_avg:98.66ms
step:1245/1770 train_time:122834ms step_avg:98.66ms
step:1246/1770 train_time:122939ms step_avg:98.67ms
step:1247/1770 train_time:123041ms step_avg:98.67ms
step:1248/1770 train_time:123145ms step_avg:98.67ms
step:1249/1770 train_time:123247ms step_avg:98.68ms
step:1250/1770 train_time:123350ms step_avg:98.68ms
step:1250/1770 val_loss:3.4239 train_time:123648ms step_avg:98.92ms
step:1251/1770 train_time:123658ms step_avg:98.85ms
step:1252/1770 train_time:123667ms step_avg:98.78ms
step:1253/1770 train_time:123675ms step_avg:98.70ms
step:1254/1770 train_time:123766ms step_avg:98.70ms
step:1255/1770 train_time:123868ms step_avg:98.70ms
step:1256/1770 train_time:123970ms step_avg:98.70ms
step:1257/1770 train_time:124074ms step_avg:98.71ms
step:1258/1770 train_time:124177ms step_avg:98.71ms
step:1259/1770 train_time:124280ms step_avg:98.71ms
step:1260/1770 train_time:124383ms step_avg:98.72ms
step:1261/1770 train_time:124485ms step_avg:98.72ms
step:1262/1770 train_time:124592ms step_avg:98.73ms
step:1263/1770 train_time:124699ms step_avg:98.73ms
step:1264/1770 train_time:124802ms step_avg:98.74ms
step:1265/1770 train_time:124904ms step_avg:98.74ms
step:1266/1770 train_time:125007ms step_avg:98.74ms
step:1267/1770 train_time:125109ms step_avg:98.74ms
step:1268/1770 train_time:125212ms step_avg:98.75ms
step:1269/1770 train_time:125315ms step_avg:98.75ms
step:1270/1770 train_time:125419ms step_avg:98.76ms
step:1271/1770 train_time:125522ms step_avg:98.76ms
step:1272/1770 train_time:125625ms step_avg:98.76ms
step:1273/1770 train_time:125729ms step_avg:98.77ms
step:1274/1770 train_time:125833ms step_avg:98.77ms
step:1275/1770 train_time:125938ms step_avg:98.78ms
step:1276/1770 train_time:126040ms step_avg:98.78ms
step:1277/1770 train_time:126142ms step_avg:98.78ms
step:1278/1770 train_time:126244ms step_avg:98.78ms
step:1279/1770 train_time:126346ms step_avg:98.79ms
step:1280/1770 train_time:126450ms step_avg:98.79ms
step:1281/1770 train_time:126555ms step_avg:98.79ms
step:1282/1770 train_time:126659ms step_avg:98.80ms
step:1283/1770 train_time:126762ms step_avg:98.80ms
step:1284/1770 train_time:126866ms step_avg:98.81ms
step:1285/1770 train_time:126969ms step_avg:98.81ms
step:1286/1770 train_time:127072ms step_avg:98.81ms
step:1287/1770 train_time:127175ms step_avg:98.82ms
step:1288/1770 train_time:127279ms step_avg:98.82ms
step:1289/1770 train_time:127382ms step_avg:98.82ms
step:1290/1770 train_time:127483ms step_avg:98.82ms
step:1291/1770 train_time:127586ms step_avg:98.83ms
step:1292/1770 train_time:127690ms step_avg:98.83ms
step:1293/1770 train_time:127794ms step_avg:98.84ms
step:1294/1770 train_time:127899ms step_avg:98.84ms
step:1295/1770 train_time:128002ms step_avg:98.84ms
step:1296/1770 train_time:128104ms step_avg:98.85ms
step:1297/1770 train_time:128207ms step_avg:98.85ms
step:1298/1770 train_time:128310ms step_avg:98.85ms
step:1299/1770 train_time:128412ms step_avg:98.85ms
step:1300/1770 train_time:128516ms step_avg:98.86ms
step:1301/1770 train_time:128620ms step_avg:98.86ms
step:1302/1770 train_time:128723ms step_avg:98.87ms
step:1303/1770 train_time:128826ms step_avg:98.87ms
step:1304/1770 train_time:128930ms step_avg:98.87ms
step:1305/1770 train_time:129033ms step_avg:98.88ms
step:1306/1770 train_time:129136ms step_avg:98.88ms
step:1307/1770 train_time:129240ms step_avg:98.88ms
step:1308/1770 train_time:129341ms step_avg:98.88ms
step:1309/1770 train_time:129443ms step_avg:98.89ms
step:1310/1770 train_time:129545ms step_avg:98.89ms
step:1311/1770 train_time:129648ms step_avg:98.89ms
step:1312/1770 train_time:129751ms step_avg:98.90ms
step:1313/1770 train_time:129856ms step_avg:98.90ms
step:1314/1770 train_time:129961ms step_avg:98.90ms
step:1315/1770 train_time:130063ms step_avg:98.91ms
step:1316/1770 train_time:130166ms step_avg:98.91ms
step:1317/1770 train_time:130268ms step_avg:98.91ms
step:1318/1770 train_time:130371ms step_avg:98.92ms
step:1319/1770 train_time:130475ms step_avg:98.92ms
step:1320/1770 train_time:130580ms step_avg:98.92ms
step:1321/1770 train_time:130681ms step_avg:98.93ms
step:1322/1770 train_time:130785ms step_avg:98.93ms
step:1323/1770 train_time:130889ms step_avg:98.93ms
step:1324/1770 train_time:130994ms step_avg:98.94ms
step:1325/1770 train_time:131098ms step_avg:98.94ms
step:1326/1770 train_time:131200ms step_avg:98.94ms
step:1327/1770 train_time:131303ms step_avg:98.95ms
step:1328/1770 train_time:131405ms step_avg:98.95ms
step:1329/1770 train_time:131512ms step_avg:98.96ms
step:1330/1770 train_time:131614ms step_avg:98.96ms
step:1331/1770 train_time:131718ms step_avg:98.96ms
step:1332/1770 train_time:131822ms step_avg:98.97ms
step:1333/1770 train_time:131924ms step_avg:98.97ms
step:1334/1770 train_time:132028ms step_avg:98.97ms
step:1335/1770 train_time:132131ms step_avg:98.97ms
step:1336/1770 train_time:132234ms step_avg:98.98ms
step:1337/1770 train_time:132338ms step_avg:98.98ms
step:1338/1770 train_time:132441ms step_avg:98.98ms
step:1339/1770 train_time:132543ms step_avg:98.99ms
step:1340/1770 train_time:132646ms step_avg:98.99ms
step:1341/1770 train_time:132749ms step_avg:98.99ms
step:1342/1770 train_time:132852ms step_avg:99.00ms
step:1343/1770 train_time:132956ms step_avg:99.00ms
step:1344/1770 train_time:133060ms step_avg:99.00ms
step:1345/1770 train_time:133164ms step_avg:99.01ms
step:1346/1770 train_time:133267ms step_avg:99.01ms
step:1347/1770 train_time:133369ms step_avg:99.01ms
step:1348/1770 train_time:133472ms step_avg:99.01ms
step:1349/1770 train_time:133575ms step_avg:99.02ms
step:1350/1770 train_time:133679ms step_avg:99.02ms
step:1351/1770 train_time:133780ms step_avg:99.02ms
step:1352/1770 train_time:133884ms step_avg:99.03ms
step:1353/1770 train_time:133987ms step_avg:99.03ms
step:1354/1770 train_time:134090ms step_avg:99.03ms
step:1355/1770 train_time:134194ms step_avg:99.04ms
step:1356/1770 train_time:134298ms step_avg:99.04ms
step:1357/1770 train_time:134401ms step_avg:99.04ms
step:1358/1770 train_time:134503ms step_avg:99.04ms
step:1359/1770 train_time:134606ms step_avg:99.05ms
step:1360/1770 train_time:134709ms step_avg:99.05ms
step:1361/1770 train_time:134813ms step_avg:99.05ms
step:1362/1770 train_time:134918ms step_avg:99.06ms
step:1363/1770 train_time:135022ms step_avg:99.06ms
step:1364/1770 train_time:135125ms step_avg:99.06ms
step:1365/1770 train_time:135228ms step_avg:99.07ms
step:1366/1770 train_time:135331ms step_avg:99.07ms
step:1367/1770 train_time:135434ms step_avg:99.07ms
step:1368/1770 train_time:135538ms step_avg:99.08ms
step:1369/1770 train_time:135641ms step_avg:99.08ms
step:1370/1770 train_time:135743ms step_avg:99.08ms
step:1371/1770 train_time:135847ms step_avg:99.09ms
step:1372/1770 train_time:135951ms step_avg:99.09ms
step:1373/1770 train_time:136055ms step_avg:99.09ms
step:1374/1770 train_time:136161ms step_avg:99.10ms
step:1375/1770 train_time:136264ms step_avg:99.10ms
step:1375/1770 val_loss:3.3804 train_time:136563ms step_avg:99.32ms
step:1376/1770 train_time:136573ms step_avg:99.25ms
step:1377/1770 train_time:136581ms step_avg:99.19ms
step:1378/1770 train_time:136590ms step_avg:99.12ms
step:1379/1770 train_time:136684ms step_avg:99.12ms
step:1380/1770 train_time:136785ms step_avg:99.12ms
step:1381/1770 train_time:136887ms step_avg:99.12ms
step:1382/1770 train_time:136989ms step_avg:99.12ms
step:1383/1770 train_time:137091ms step_avg:99.13ms
step:1384/1770 train_time:137195ms step_avg:99.13ms
step:1385/1770 train_time:137297ms step_avg:99.13ms
step:1386/1770 train_time:137400ms step_avg:99.13ms
step:1387/1770 train_time:137507ms step_avg:99.14ms
step:1388/1770 train_time:137614ms step_avg:99.15ms
step:1389/1770 train_time:137718ms step_avg:99.15ms
step:1390/1770 train_time:137821ms step_avg:99.15ms
step:1391/1770 train_time:137923ms step_avg:99.15ms
step:1392/1770 train_time:138024ms step_avg:99.16ms
step:1393/1770 train_time:138127ms step_avg:99.16ms
step:1394/1770 train_time:138229ms step_avg:99.16ms
step:1395/1770 train_time:138333ms step_avg:99.16ms
step:1396/1770 train_time:138438ms step_avg:99.17ms
step:1397/1770 train_time:138542ms step_avg:99.17ms
step:1398/1770 train_time:138648ms step_avg:99.18ms
step:1399/1770 train_time:138753ms step_avg:99.18ms
step:1400/1770 train_time:138857ms step_avg:99.18ms
step:1401/1770 train_time:138960ms step_avg:99.19ms
step:1402/1770 train_time:139061ms step_avg:99.19ms
step:1403/1770 train_time:139163ms step_avg:99.19ms
step:1404/1770 train_time:139265ms step_avg:99.19ms
step:1405/1770 train_time:139369ms step_avg:99.19ms
step:1406/1770 train_time:139475ms step_avg:99.20ms
step:1407/1770 train_time:139578ms step_avg:99.20ms
step:1408/1770 train_time:139683ms step_avg:99.21ms
step:1409/1770 train_time:139786ms step_avg:99.21ms
step:1410/1770 train_time:139890ms step_avg:99.21ms
step:1411/1770 train_time:139995ms step_avg:99.22ms
step:1412/1770 train_time:140097ms step_avg:99.22ms
step:1413/1770 train_time:140200ms step_avg:99.22ms
step:1414/1770 train_time:140302ms step_avg:99.22ms
step:1415/1770 train_time:140406ms step_avg:99.23ms
step:1416/1770 train_time:140511ms step_avg:99.23ms
step:1417/1770 train_time:140616ms step_avg:99.24ms
step:1418/1770 train_time:140720ms step_avg:99.24ms
step:1419/1770 train_time:140822ms step_avg:99.24ms
step:1420/1770 train_time:140925ms step_avg:99.24ms
step:1421/1770 train_time:141029ms step_avg:99.25ms
step:1422/1770 train_time:141133ms step_avg:99.25ms
step:1423/1770 train_time:141235ms step_avg:99.25ms
step:1424/1770 train_time:141337ms step_avg:99.25ms
step:1425/1770 train_time:141440ms step_avg:99.26ms
step:1426/1770 train_time:141543ms step_avg:99.26ms
step:1427/1770 train_time:141646ms step_avg:99.26ms
step:1428/1770 train_time:141751ms step_avg:99.27ms
step:1429/1770 train_time:141855ms step_avg:99.27ms
step:1430/1770 train_time:141958ms step_avg:99.27ms
step:1431/1770 train_time:142060ms step_avg:99.27ms
step:1432/1770 train_time:142163ms step_avg:99.28ms
step:1433/1770 train_time:142266ms step_avg:99.28ms
step:1434/1770 train_time:142370ms step_avg:99.28ms
step:1435/1770 train_time:142474ms step_avg:99.29ms
step:1436/1770 train_time:142578ms step_avg:99.29ms
step:1437/1770 train_time:142681ms step_avg:99.29ms
step:1438/1770 train_time:142788ms step_avg:99.30ms
step:1439/1770 train_time:142891ms step_avg:99.30ms
step:1440/1770 train_time:142994ms step_avg:99.30ms
step:1441/1770 train_time:143096ms step_avg:99.30ms
step:1442/1770 train_time:143199ms step_avg:99.31ms
step:1443/1770 train_time:143303ms step_avg:99.31ms
step:1444/1770 train_time:143405ms step_avg:99.31ms
step:1445/1770 train_time:143509ms step_avg:99.31ms
step:1446/1770 train_time:143616ms step_avg:99.32ms
step:1447/1770 train_time:143720ms step_avg:99.32ms
step:1448/1770 train_time:143824ms step_avg:99.33ms
step:1449/1770 train_time:143928ms step_avg:99.33ms
step:1450/1770 train_time:144033ms step_avg:99.33ms
step:1451/1770 train_time:144140ms step_avg:99.34ms
step:1452/1770 train_time:144243ms step_avg:99.34ms
step:1453/1770 train_time:144347ms step_avg:99.34ms
step:1454/1770 train_time:144451ms step_avg:99.35ms
step:1455/1770 train_time:144558ms step_avg:99.35ms
step:1456/1770 train_time:144662ms step_avg:99.36ms
step:1457/1770 train_time:144766ms step_avg:99.36ms
step:1458/1770 train_time:144871ms step_avg:99.36ms
step:1459/1770 train_time:144978ms step_avg:99.37ms
step:1460/1770 train_time:145082ms step_avg:99.37ms
step:1461/1770 train_time:145185ms step_avg:99.37ms
step:1462/1770 train_time:145289ms step_avg:99.38ms
step:1463/1770 train_time:145394ms step_avg:99.38ms
step:1464/1770 train_time:145500ms step_avg:99.39ms
step:1465/1770 train_time:145605ms step_avg:99.39ms
step:1466/1770 train_time:145709ms step_avg:99.39ms
step:1467/1770 train_time:145814ms step_avg:99.40ms
step:1468/1770 train_time:145920ms step_avg:99.40ms
step:1469/1770 train_time:146025ms step_avg:99.40ms
step:1470/1770 train_time:146129ms step_avg:99.41ms
step:1471/1770 train_time:146232ms step_avg:99.41ms
step:1472/1770 train_time:146336ms step_avg:99.41ms
step:1473/1770 train_time:146439ms step_avg:99.42ms
step:1474/1770 train_time:146544ms step_avg:99.42ms
step:1475/1770 train_time:146651ms step_avg:99.42ms
step:1476/1770 train_time:146755ms step_avg:99.43ms
step:1477/1770 train_time:146859ms step_avg:99.43ms
step:1478/1770 train_time:146964ms step_avg:99.43ms
step:1479/1770 train_time:147068ms step_avg:99.44ms
step:1480/1770 train_time:147172ms step_avg:99.44ms
step:1481/1770 train_time:147279ms step_avg:99.45ms
step:1482/1770 train_time:147384ms step_avg:99.45ms
step:1483/1770 train_time:147491ms step_avg:99.45ms
step:1484/1770 train_time:147595ms step_avg:99.46ms
step:1485/1770 train_time:147699ms step_avg:99.46ms
step:1486/1770 train_time:147804ms step_avg:99.46ms
step:1487/1770 train_time:147908ms step_avg:99.47ms
step:1488/1770 train_time:148013ms step_avg:99.47ms
step:1489/1770 train_time:148116ms step_avg:99.47ms
step:1490/1770 train_time:148222ms step_avg:99.48ms
step:1491/1770 train_time:148328ms step_avg:99.48ms
step:1492/1770 train_time:148431ms step_avg:99.48ms
step:1493/1770 train_time:148535ms step_avg:99.49ms
step:1494/1770 train_time:148639ms step_avg:99.49ms
step:1495/1770 train_time:148746ms step_avg:99.50ms
step:1496/1770 train_time:148854ms step_avg:99.50ms
step:1497/1770 train_time:148959ms step_avg:99.51ms
step:1498/1770 train_time:149062ms step_avg:99.51ms
step:1499/1770 train_time:149165ms step_avg:99.51ms
step:1500/1770 train_time:149269ms step_avg:99.51ms
step:1500/1770 val_loss:3.3426 train_time:149570ms step_avg:99.71ms
step:1501/1770 train_time:149581ms step_avg:99.65ms
step:1502/1770 train_time:149590ms step_avg:99.59ms
step:1503/1770 train_time:149598ms step_avg:99.53ms
step:1504/1770 train_time:149690ms step_avg:99.53ms
step:1505/1770 train_time:149796ms step_avg:99.53ms
step:1506/1770 train_time:149899ms step_avg:99.53ms
step:1507/1770 train_time:150004ms step_avg:99.54ms
step:1508/1770 train_time:150107ms step_avg:99.54ms
step:1509/1770 train_time:150211ms step_avg:99.54ms
step:1510/1770 train_time:150314ms step_avg:99.55ms
step:1511/1770 train_time:150418ms step_avg:99.55ms
step:1512/1770 train_time:150524ms step_avg:99.55ms
step:1513/1770 train_time:150632ms step_avg:99.56ms
step:1514/1770 train_time:150737ms step_avg:99.56ms
step:1515/1770 train_time:150840ms step_avg:99.56ms
step:1516/1770 train_time:150943ms step_avg:99.57ms
step:1517/1770 train_time:151047ms step_avg:99.57ms
step:1518/1770 train_time:151151ms step_avg:99.57ms
step:1519/1770 train_time:151254ms step_avg:99.57ms
step:1520/1770 train_time:151359ms step_avg:99.58ms
step:1521/1770 train_time:151464ms step_avg:99.58ms
step:1522/1770 train_time:151569ms step_avg:99.59ms
step:1523/1770 train_time:151676ms step_avg:99.59ms
step:1524/1770 train_time:151780ms step_avg:99.59ms
step:1525/1770 train_time:151885ms step_avg:99.60ms
step:1526/1770 train_time:151989ms step_avg:99.60ms
step:1527/1770 train_time:152093ms step_avg:99.60ms
step:1528/1770 train_time:152196ms step_avg:99.60ms
step:1529/1770 train_time:152301ms step_avg:99.61ms
step:1530/1770 train_time:152407ms step_avg:99.61ms
step:1531/1770 train_time:152513ms step_avg:99.62ms
step:1532/1770 train_time:152619ms step_avg:99.62ms
step:1533/1770 train_time:152722ms step_avg:99.62ms
step:1534/1770 train_time:152827ms step_avg:99.63ms
step:1535/1770 train_time:152933ms step_avg:99.63ms
step:1536/1770 train_time:153036ms step_avg:99.63ms
step:1537/1770 train_time:153140ms step_avg:99.64ms
step:1538/1770 train_time:153244ms step_avg:99.64ms
step:1539/1770 train_time:153349ms step_avg:99.64ms
step:1540/1770 train_time:153456ms step_avg:99.65ms
step:1541/1770 train_time:153560ms step_avg:99.65ms
step:1542/1770 train_time:153666ms step_avg:99.65ms
step:1543/1770 train_time:153771ms step_avg:99.66ms
step:1544/1770 train_time:153877ms step_avg:99.66ms
step:1545/1770 train_time:153981ms step_avg:99.66ms
step:1546/1770 train_time:154085ms step_avg:99.67ms
step:1547/1770 train_time:154191ms step_avg:99.67ms
step:1548/1770 train_time:154296ms step_avg:99.67ms
step:1549/1770 train_time:154398ms step_avg:99.68ms
step:1550/1770 train_time:154502ms step_avg:99.68ms
step:1551/1770 train_time:154607ms step_avg:99.68ms
step:1552/1770 train_time:154712ms step_avg:99.69ms
step:1553/1770 train_time:154815ms step_avg:99.69ms
step:1554/1770 train_time:154920ms step_avg:99.69ms
step:1555/1770 train_time:155024ms step_avg:99.69ms
step:1556/1770 train_time:155127ms step_avg:99.70ms
step:1557/1770 train_time:155232ms step_avg:99.70ms
step:1558/1770 train_time:155336ms step_avg:99.70ms
step:1559/1770 train_time:155440ms step_avg:99.70ms
step:1560/1770 train_time:155544ms step_avg:99.71ms
step:1561/1770 train_time:155649ms step_avg:99.71ms
step:1562/1770 train_time:155754ms step_avg:99.71ms
step:1563/1770 train_time:155860ms step_avg:99.72ms
step:1564/1770 train_time:155963ms step_avg:99.72ms
step:1565/1770 train_time:156067ms step_avg:99.72ms
step:1566/1770 train_time:156172ms step_avg:99.73ms
step:1567/1770 train_time:156276ms step_avg:99.73ms
step:1568/1770 train_time:156380ms step_avg:99.73ms
step:1569/1770 train_time:156485ms step_avg:99.74ms
step:1570/1770 train_time:156589ms step_avg:99.74ms
step:1571/1770 train_time:156696ms step_avg:99.74ms
step:1572/1770 train_time:156801ms step_avg:99.75ms
step:1573/1770 train_time:156906ms step_avg:99.75ms
step:1574/1770 train_time:157010ms step_avg:99.75ms
step:1575/1770 train_time:157116ms step_avg:99.76ms
step:1576/1770 train_time:157220ms step_avg:99.76ms
step:1577/1770 train_time:157325ms step_avg:99.76ms
step:1578/1770 train_time:157429ms step_avg:99.76ms
step:1579/1770 train_time:157536ms step_avg:99.77ms
step:1580/1770 train_time:157642ms step_avg:99.77ms
step:1581/1770 train_time:157745ms step_avg:99.78ms
step:1582/1770 train_time:157851ms step_avg:99.78ms
step:1583/1770 train_time:157956ms step_avg:99.78ms
step:1584/1770 train_time:158060ms step_avg:99.79ms
step:1585/1770 train_time:158164ms step_avg:99.79ms
step:1586/1770 train_time:158270ms step_avg:99.79ms
step:1587/1770 train_time:158373ms step_avg:99.79ms
step:1588/1770 train_time:158479ms step_avg:99.80ms
step:1589/1770 train_time:158583ms step_avg:99.80ms
step:1590/1770 train_time:158687ms step_avg:99.80ms
step:1591/1770 train_time:158793ms step_avg:99.81ms
step:1592/1770 train_time:158897ms step_avg:99.81ms
step:1593/1770 train_time:158999ms step_avg:99.81ms
step:1594/1770 train_time:159104ms step_avg:99.81ms
step:1595/1770 train_time:159209ms step_avg:99.82ms
step:1596/1770 train_time:159314ms step_avg:99.82ms
step:1597/1770 train_time:159418ms step_avg:99.82ms
step:1598/1770 train_time:159522ms step_avg:99.83ms
step:1599/1770 train_time:159625ms step_avg:99.83ms
step:1600/1770 train_time:159731ms step_avg:99.83ms
step:1601/1770 train_time:159836ms step_avg:99.84ms
step:1602/1770 train_time:159942ms step_avg:99.84ms
step:1603/1770 train_time:160046ms step_avg:99.84ms
step:1604/1770 train_time:160151ms step_avg:99.84ms
step:1605/1770 train_time:160255ms step_avg:99.85ms
step:1606/1770 train_time:160359ms step_avg:99.85ms
step:1607/1770 train_time:160464ms step_avg:99.85ms
step:1608/1770 train_time:160569ms step_avg:99.86ms
step:1609/1770 train_time:160679ms step_avg:99.86ms
step:1610/1770 train_time:160784ms step_avg:99.87ms
step:1611/1770 train_time:160890ms step_avg:99.87ms
step:1612/1770 train_time:160995ms step_avg:99.87ms
step:1613/1770 train_time:161100ms step_avg:99.88ms
step:1614/1770 train_time:161204ms step_avg:99.88ms
step:1615/1770 train_time:161308ms step_avg:99.88ms
step:1616/1770 train_time:161413ms step_avg:99.88ms
step:1617/1770 train_time:161519ms step_avg:99.89ms
step:1618/1770 train_time:161624ms step_avg:99.89ms
step:1619/1770 train_time:161729ms step_avg:99.89ms
step:1620/1770 train_time:161836ms step_avg:99.90ms
step:1621/1770 train_time:161940ms step_avg:99.90ms
step:1622/1770 train_time:162046ms step_avg:99.90ms
step:1623/1770 train_time:162150ms step_avg:99.91ms
step:1624/1770 train_time:162254ms step_avg:99.91ms
step:1625/1770 train_time:162361ms step_avg:99.91ms
step:1625/1770 val_loss:3.3080 train_time:162662ms step_avg:100.10ms
step:1626/1770 train_time:162673ms step_avg:100.04ms
step:1627/1770 train_time:162682ms step_avg:99.99ms
step:1628/1770 train_time:162690ms step_avg:99.93ms
step:1629/1770 train_time:162782ms step_avg:99.93ms
step:1630/1770 train_time:162885ms step_avg:99.93ms
step:1631/1770 train_time:162988ms step_avg:99.93ms
step:1632/1770 train_time:163090ms step_avg:99.93ms
step:1633/1770 train_time:163193ms step_avg:99.93ms
step:1634/1770 train_time:163295ms step_avg:99.94ms
step:1635/1770 train_time:163399ms step_avg:99.94ms
step:1636/1770 train_time:163502ms step_avg:99.94ms
step:1637/1770 train_time:163611ms step_avg:99.95ms
step:1638/1770 train_time:163718ms step_avg:99.95ms
step:1639/1770 train_time:163824ms step_avg:99.95ms
step:1640/1770 train_time:163927ms step_avg:99.96ms
step:1641/1770 train_time:164031ms step_avg:99.96ms
step:1642/1770 train_time:164135ms step_avg:99.96ms
step:1643/1770 train_time:164238ms step_avg:99.96ms
step:1644/1770 train_time:164341ms step_avg:99.96ms
step:1645/1770 train_time:164446ms step_avg:99.97ms
step:1646/1770 train_time:164552ms step_avg:99.97ms
step:1647/1770 train_time:164658ms step_avg:99.97ms
step:1648/1770 train_time:164767ms step_avg:99.98ms
step:1649/1770 train_time:164873ms step_avg:99.98ms
step:1650/1770 train_time:164976ms step_avg:99.99ms
step:1651/1770 train_time:165080ms step_avg:99.99ms
step:1652/1770 train_time:165184ms step_avg:99.99ms
step:1653/1770 train_time:165286ms step_avg:99.99ms
step:1654/1770 train_time:165391ms step_avg:99.99ms
step:1655/1770 train_time:165495ms step_avg:100.00ms
step:1656/1770 train_time:165602ms step_avg:100.00ms
step:1657/1770 train_time:165708ms step_avg:100.00ms
step:1658/1770 train_time:165813ms step_avg:100.01ms
step:1659/1770 train_time:165919ms step_avg:100.01ms
step:1660/1770 train_time:166024ms step_avg:100.01ms
step:1661/1770 train_time:166129ms step_avg:100.02ms
step:1662/1770 train_time:166231ms step_avg:100.02ms
step:1663/1770 train_time:166335ms step_avg:100.02ms
step:1664/1770 train_time:166439ms step_avg:100.02ms
step:1665/1770 train_time:166544ms step_avg:100.03ms
step:1666/1770 train_time:166650ms step_avg:100.03ms
step:1667/1770 train_time:166755ms step_avg:100.03ms
step:1668/1770 train_time:166860ms step_avg:100.04ms
step:1669/1770 train_time:166965ms step_avg:100.04ms
step:1670/1770 train_time:167069ms step_avg:100.04ms
step:1671/1770 train_time:167172ms step_avg:100.04ms
step:1672/1770 train_time:167276ms step_avg:100.05ms
step:1673/1770 train_time:167381ms step_avg:100.05ms
step:1674/1770 train_time:167486ms step_avg:100.05ms
step:1675/1770 train_time:167592ms step_avg:100.05ms
step:1676/1770 train_time:167695ms step_avg:100.06ms
step:1677/1770 train_time:167801ms step_avg:100.06ms
step:1678/1770 train_time:167909ms step_avg:100.06ms
step:1679/1770 train_time:168016ms step_avg:100.07ms
step:1680/1770 train_time:168119ms step_avg:100.07ms
step:1681/1770 train_time:168223ms step_avg:100.07ms
step:1682/1770 train_time:168326ms step_avg:100.08ms
step:1683/1770 train_time:168430ms step_avg:100.08ms
step:1684/1770 train_time:168537ms step_avg:100.08ms
step:1685/1770 train_time:168642ms step_avg:100.08ms
step:1686/1770 train_time:168748ms step_avg:100.09ms
step:1687/1770 train_time:168853ms step_avg:100.09ms
step:1688/1770 train_time:168960ms step_avg:100.09ms
step:1689/1770 train_time:169068ms step_avg:100.10ms
step:1690/1770 train_time:169172ms step_avg:100.10ms
step:1691/1770 train_time:169275ms step_avg:100.10ms
step:1692/1770 train_time:169379ms step_avg:100.11ms
step:1693/1770 train_time:169484ms step_avg:100.11ms
step:1694/1770 train_time:169590ms step_avg:100.11ms
step:1695/1770 train_time:169694ms step_avg:100.11ms
step:1696/1770 train_time:169799ms step_avg:100.12ms
step:1697/1770 train_time:169905ms step_avg:100.12ms
step:1698/1770 train_time:170011ms step_avg:100.12ms
step:1699/1770 train_time:170116ms step_avg:100.13ms
step:1700/1770 train_time:170220ms step_avg:100.13ms
step:1701/1770 train_time:170324ms step_avg:100.13ms
step:1702/1770 train_time:170427ms step_avg:100.13ms
step:1703/1770 train_time:170531ms step_avg:100.14ms
step:1704/1770 train_time:170634ms step_avg:100.14ms
step:1705/1770 train_time:170739ms step_avg:100.14ms
step:1706/1770 train_time:170848ms step_avg:100.15ms
step:1707/1770 train_time:170953ms step_avg:100.15ms
step:1708/1770 train_time:171058ms step_avg:100.15ms
step:1709/1770 train_time:171165ms step_avg:100.15ms
step:1710/1770 train_time:171269ms step_avg:100.16ms
step:1711/1770 train_time:171374ms step_avg:100.16ms
step:1712/1770 train_time:171485ms step_avg:100.17ms
step:1713/1770 train_time:171594ms step_avg:100.17ms
step:1714/1770 train_time:171698ms step_avg:100.17ms
step:1715/1770 train_time:171805ms step_avg:100.18ms
step:1716/1770 train_time:171909ms step_avg:100.18ms
step:1717/1770 train_time:172015ms step_avg:100.18ms
step:1718/1770 train_time:172121ms step_avg:100.19ms
step:1719/1770 train_time:172226ms step_avg:100.19ms
step:1720/1770 train_time:172330ms step_avg:100.19ms
step:1721/1770 train_time:172437ms step_avg:100.20ms
step:1722/1770 train_time:172545ms step_avg:100.20ms
step:1723/1770 train_time:172651ms step_avg:100.20ms
step:1724/1770 train_time:172758ms step_avg:100.21ms
step:1725/1770 train_time:172864ms step_avg:100.21ms
step:1726/1770 train_time:172971ms step_avg:100.21ms
step:1727/1770 train_time:173078ms step_avg:100.22ms
step:1728/1770 train_time:173185ms step_avg:100.22ms
step:1729/1770 train_time:173291ms step_avg:100.23ms
step:1730/1770 train_time:173397ms step_avg:100.23ms
step:1731/1770 train_time:173503ms step_avg:100.23ms
step:1732/1770 train_time:173608ms step_avg:100.24ms
step:1733/1770 train_time:173713ms step_avg:100.24ms
step:1734/1770 train_time:173819ms step_avg:100.24ms
step:1735/1770 train_time:173926ms step_avg:100.25ms
step:1736/1770 train_time:174030ms step_avg:100.25ms
step:1737/1770 train_time:174135ms step_avg:100.25ms
step:1738/1770 train_time:174240ms step_avg:100.25ms
step:1739/1770 train_time:174347ms step_avg:100.26ms
step:1740/1770 train_time:174452ms step_avg:100.26ms
step:1741/1770 train_time:174556ms step_avg:100.26ms
step:1742/1770 train_time:174662ms step_avg:100.26ms
step:1743/1770 train_time:174769ms step_avg:100.27ms
step:1744/1770 train_time:174876ms step_avg:100.27ms
step:1745/1770 train_time:174982ms step_avg:100.28ms
step:1746/1770 train_time:175088ms step_avg:100.28ms
step:1747/1770 train_time:175193ms step_avg:100.28ms
step:1748/1770 train_time:175301ms step_avg:100.29ms
step:1749/1770 train_time:175406ms step_avg:100.29ms
step:1750/1770 train_time:175514ms step_avg:100.29ms
step:1750/1770 val_loss:3.2813 train_time:175816ms step_avg:100.47ms
step:1751/1770 train_time:175826ms step_avg:100.41ms
step:1752/1770 train_time:175836ms step_avg:100.36ms
step:1753/1770 train_time:175844ms step_avg:100.31ms
step:1754/1770 train_time:175935ms step_avg:100.31ms
step:1755/1770 train_time:176040ms step_avg:100.31ms
step:1756/1770 train_time:176144ms step_avg:100.31ms
step:1757/1770 train_time:176248ms step_avg:100.31ms
step:1758/1770 train_time:176352ms step_avg:100.31ms
step:1759/1770 train_time:176457ms step_avg:100.32ms
step:1760/1770 train_time:176561ms step_avg:100.32ms
step:1761/1770 train_time:176667ms step_avg:100.32ms
step:1762/1770 train_time:176778ms step_avg:100.33ms
step:1763/1770 train_time:176885ms step_avg:100.33ms
step:1764/1770 train_time:176995ms step_avg:100.34ms
step:1765/1770 train_time:177099ms step_avg:100.34ms
step:1766/1770 train_time:177202ms step_avg:100.34ms
step:1767/1770 train_time:177307ms step_avg:100.34ms
step:1768/1770 train_time:177415ms step_avg:100.35ms
step:1769/1770 train_time:177517ms step_avg:100.35ms
step:1770/1770 train_time:177622ms step_avg:100.35ms
step:1770/1770 val_loss:3.2782 train_time:177930ms step_avg:100.53ms
peak memory allocated: 29784 MiB reserved: 40536 MiB