# Run-level settings: identification, log location, seeding and resume flags.
# String values are quoted so none of them can be re-typed by a YAML loader.
task_id: '2.0t_0.3.3_continue'
log_root_prefix: './work_dirs/dreamforge-t-log'
# Interpolation: takes its value from model.name.
projname: '${model.name}'
try_run: false
debug: false
# '???' is presumably OmegaConf's mandatory-missing marker, i.e. the value has to
# be supplied from outside this file before it is read — TODO confirm.
log_root: '???'
init_method: 'env://'
seed: 42
fix_seed_within_batch: false
# Checkpoint directory to resume from; the path sits under log_root_prefix.
resume_from_checkpoint: './work_dirs/dreamforge-t-log/SDv1.5mv-rawbox-t_2024-08-30_19-09_2.0t_0.3.3/checkpoint-150000/'
resume_reset_scheduler: false
validation_only: false
# Model configuration: base weights, module class paths, UNet / ControlNet
# options and the auxiliary embedders.
model:
  name: 'SDv1.5mv-rawbox-t'
  pretrained_model_name_or_path: './pretrained/stable-diffusion-v1-5/'
  bbox_mode: all-xyz
  bbox_view_shared: false
  crossview_attn_type: basic
  train_with_same_noise: false
  train_with_same_t: true

  # Dotted class paths selecting the runner, pipeline and UNet implementations.
  runner_module: dreamforge.runner.multiview_t_runner.MultiviewTRunner
  pipe_module: dreamforge.pipeline.pipeline_bev_controlnet_t.StableDiffusionBEVControlNetTPipeline
  unet_module: dreamforge.networks.unet_2d_condition_multiview_st.UNet2DConditionModelMultiviewSceneT
  use_fp32_for_unet_trainable: true
  unet_dir: unet
  unet:
    trainable_state: only_new
    neighboring_view_pair: ${dataset.neighboring_view_pair}
    neighboring_attn_type: add
    zero_module_type: zero_linear
    # Relative interpolations (`..`) below read sibling keys of `unet` under `model`.
    crossview_attn_type: ${..crossview_attn_type}
    img_size: ${dataset.image_size}
    video_length: ${..video_length}
    temp_pos_emb: learnable
    # The literal string "none", not a YAML null.
    zero_module_type2: 'none'
    spatial_trainable: true
    with_ref: true
    ref_length: ${..ref_length}
    with_can_bus: true
    with_motion: true
    transformer_type: _ff_last

  model_module: dreamforge.networks.unet_addon_rawbox.BEVControlNetModel
  controlnet_dir: controlnet
  controlnet:
    # NOTE(review): 189 = 7 * 3 * (1 + 2 * 4), which would match uncond_cam_in_dim
    # [3, 7] pushed through cam_embedder_param (input_dims 3, num_freqs 4,
    # include_input) if that is a standard Fourier embedding — TODO confirm.
    camera_in_dim: 189
    camera_out_dim: 768
    # NOTE(review): first entry is 4 while dataset.map_classes lists 8 names and
    # dataset_cache_file_tag starts with "8x200x200" — confirm this is intended.
    map_size: [4, 200, 200]
    conditioning_embedding_out_channels: [16, 32, 96, 256]
    uncond_cam_in_dim: [3, 7]
    use_uncond_map: null
    drop_cond_ratio: 0.25
    drop_cam_num: 6
    drop_cam_with_box: false
    cam_embedder_param:
      input_dims: 3
      num_freqs: 4
      include_input: true
      log_sampling: true
    bbox_embedder_cls: dreamforge.networks.bbox_embedder.ContinuousBBoxWithTextEmbedding
    bbox_embedder_param:
      # dataset.object_classes lists 10 names.
      n_classes: 10
      class_token_dim: 768
      trainable_class_token: false
      use_text_encoder_init: true
      embedder_num_freq: 4
      proj_dims: [768, 512, 512, 768]
      # Resolves to model.bbox_mode.
      mode: ${...bbox_mode}
      minmax_normalize: false
    with_layout_canvas: true
    canvas_conditioning_channels: 14

  load_pretrain_from: null
  allow_partial_load: false
  pretrained_dreamforge: './pretrained/dreamforge-s'
  train_with_same_noise_t: false
  video_length: 7
  ref_length: 2
  # One triple per entry; there are 7 entries, the same count as video_length.
  sc_attn_index:
    - [0, 6, 0]
    - [0, 6, 0]
    - [0, 6, 1]
    - [0, 6, 2]
    - [0, 6, 3]
    - [0, 6, 4]
    - [0, 6, 5]

  scene_embedder_cls: dreamforge.networks.scene_position_embedder.ScenePositionEmbedding
  scene_embedder_dir: scene_embedder
  scene_embedder:
    embed_dims: 320
    LID: false

  can_bus_embedder_cls: dreamforge.networks.can_bus_embedder.CanbusEmbedding
  can_bus_embedder_dir: can_bus_embedder
  can_bus_embedder:
    embed_dims: 768
    input_channels: 9
    can_bus_norm: true

  fix_controlnet: true
# Dataset configuration: file locations, class lists, geometry, the train/test
# processing pipelines and the per-split dataset definitions.
dataset:
  dataset_type: NuScenesMapDataset
  dataset_root: './data/nuscenes'
  dataset_process_root: './data/nuscenes_mmdet3d-12Hz_description/'
  dataset_cache_file_tag: 8x200x200_12Hz_interp
  dataset_cache_dirname: nuscenes_map_aux_12Hz_interp
  # Index 0 is referenced by train_pipeline, index 1 by test_pipeline.
  dataset_cache_file:
    - ${..dataset_process_root}../${..dataset_cache_dirname}/train_${..dataset_cache_file_tag}.h5
    - ${..dataset_process_root}../${..dataset_cache_dirname}/val_${..dataset_cache_file_tag}.h5
  # The {location} / {description} placeholders share names with entries of
  # collect_meta_lis_keys.
  template: 'A driving scene image at {location}. {description}.'
  collect_meta_keys:
    - camera_intrinsics
    - lidar2ego
    - lidar2camera
    - camera2lidar
    - lidar2image
    - img_aug_matrix
    - camera2ego
    - ego2global
  collect_meta_lis_keys: [timeofday, location, description, filename, token, ori_shape]
  image_size: [224, 400]
  # Presumably [min, max, step] per axis; (50 - (-50)) / 0.5 = 200 — TODO confirm.
  map_bound:
    x: [-50.0, 50.0, 0.5]
    # Key kept quoted: a bare y is read as a boolean by YAML 1.1 loaders.
    'y': [-50.0, 50.0, 0.5]
  view_order:
    - CAM_FRONT_LEFT
    - CAM_FRONT
    - CAM_FRONT_RIGHT
    - CAM_BACK_RIGHT
    - CAM_BACK
    - CAM_BACK_LEFT
  # Keys and values are integers in 0..5; view_order has 6 entries.
  neighboring_view_pair:
    0: [5, 1]
    1: [0, 2]
    2: [1, 3]
    3: [2, 4]
    4: [3, 5]
    5: [4, 0]
  # NOTE(review): 896 = 4 * 224, 1600 = 4 * 400 and 896 + 4 = 900; presumably this
  # restores images to 900x1600 after generation — TODO confirm.
  back_resize: [896, 1600]
  back_pad: [0, 4, 0, 0]
  augment2d:
    # A list holding one pair; the pipelines read it through `resize[0]`.
    resize: [[0.25, 0.25]]
    rotate: null
  aux_data: [visibility, center_offset, center_ohw, height]
  augment3d:
    scale: [1.0, 1.0]
    rotate: [0.0, 0.0]
    translate: 0
    flip_ratio: 0.0
    flip_direction: null
  object_classes:
    - car
    - truck
    - construction_vehicle
    - bus
    - trailer
    - barrier
    - motorcycle
    - bicycle
    - pedestrian
    - traffic_cone
  map_classes:
    - drivable_area
    - ped_crossing
    - walkway
    - stop_line
    - carpark_area
    - road_divider
    - lane_divider
    - road_block
  input_modality:
    use_lidar: false
    use_camera: true
    use_radar: false
    use_map: false
    use_external: false

  # Processing steps for the training split. Inside a step, `${...key}` climbs
  # from the step, through the list, to `dataset`, so it reads dataset.<key>.
  train_pipeline:
    - type: LoadMultiViewImageFromFiles
      to_float32: true
    - type: LoadAnnotations3D
      with_bbox_3d: true
      with_label_3d: true
      with_attr_label: false
    - type: ImageAug3D
      final_dim: ${...image_size}
      resize_lim: ${...augment2d.resize[0]}
      bot_pct_lim: [0.0, 0.0]
      rot_lim: ${...augment2d.rotate}
      rand_flip: false
      # NOTE(review): false even though this is the training pipeline — confirm.
      is_train: false
    - type: GlobalRotScaleTrans
      resize_lim: ${...augment3d.scale}
      rot_lim: ${...augment3d.rotate}
      trans_lim: ${...augment3d.translate}
      is_train: true
    - type: ObjectNameFilter
      classes: ${...object_classes}
    - type: LoadBEVSegmentation
      dataset_root: ${...dataset_root}
      xbound: ${...map_bound.x}
      ybound: ${...map_bound.y}
      classes: ${...map_classes}
      object_classes: null
      aux_data: null
      cache_file: ${...dataset_cache_file.0}
    - type: RandomFlip3DwithViews
      flip_ratio: ${...augment3d.flip_ratio}
      direction: ${...augment3d.flip_direction}
    - type: ReorderMultiViewImages
      order: ${...view_order}
      safe: false
    - type: ImageNormalize
      mean: [0.5, 0.5, 0.5]
      std: [0.5, 0.5, 0.5]
    - type: DefaultFormatBundle3D
      classes: ${...object_classes}
    - type: Collect3D
      keys: [img, gt_bboxes_3d, gt_labels_3d, gt_masks_bev]
      meta_keys: ${...collect_meta_keys}
      meta_lis_keys: ${...collect_meta_lis_keys}

  # Processing steps used by the val and test splits. Compared with
  # train_pipeline: no RandomFlip3DwithViews step, a literal rot_lim for
  # ImageAug3D, and cache file index 1 instead of 0.
  test_pipeline:
    - type: LoadMultiViewImageFromFiles
      to_float32: true
    - type: LoadAnnotations3D
      with_bbox_3d: true
      with_label_3d: true
      with_attr_label: false
    - type: ImageAug3D
      final_dim: ${...image_size}
      resize_lim: ${...augment2d.resize[0]}
      bot_pct_lim: [0.0, 0.0]
      rot_lim: [0.0, 0.0]
      rand_flip: false
      is_train: false
    - type: GlobalRotScaleTrans
      resize_lim: ${...augment3d.scale}
      rot_lim: ${...augment3d.rotate}
      trans_lim: ${...augment3d.translate}
      # NOTE(review): true here as in train_pipeline; augment3d currently holds
      # scale [1.0, 1.0], rotate [0.0, 0.0], translate 0 — confirm this is intended.
      is_train: true
    - type: ObjectNameFilter
      classes: ${...object_classes}
    - type: LoadBEVSegmentation
      dataset_root: ${...dataset_root}
      xbound: ${...map_bound.x}
      ybound: ${...map_bound.y}
      classes: ${...map_classes}
      object_classes: null
      aux_data: null
      cache_file: ${...dataset_cache_file.1}
    - type: ReorderMultiViewImages
      order: ${...view_order}
      safe: false
    - type: ImageNormalize
      mean: [0.5, 0.5, 0.5]
      std: [0.5, 0.5, 0.5]
    - type: DefaultFormatBundle3D
      classes: ${...object_classes}
    - type: Collect3D
      keys: [img, gt_bboxes_3d, gt_labels_3d, gt_masks_bev]
      meta_keys: ${...collect_meta_keys}
      meta_lis_keys: ${...collect_meta_lis_keys}

  # Per-split dataset definitions. Here `${...key}` climbs from the split,
  # through `data`, to `dataset`.
  data:
    train:
      type: ${...dataset_type}
      dataset_root: ${...dataset_root}
      ann_file: ${...dataset_process_root}nuscenes_interp_12Hz_updated_description_train.pickle
      pipeline: ${...train_pipeline}
      object_classes: ${...object_classes}
      map_classes: ${...map_classes}
      modality: ${...input_modality}
      test_mode: false
      force_all_boxes: true
      box_type_3d: LiDAR
      filter_empty_gt: false
      video_length: ${model.video_length}
      start_on_keyframe: ${dataset.start_on_keyframe}
      ref_length: ${model.ref_length}
      candidate_length: 5
    # Uses test_pipeline and the val annotation file, with test_mode false.
    val:
      type: ${...dataset_type}
      dataset_root: ${...dataset_root}
      ann_file: ${...dataset_process_root}nuscenes_interp_12Hz_updated_description_val.pickle
      pipeline: ${...test_pipeline}
      object_classes: ${...object_classes}
      map_classes: ${...map_classes}
      modality: ${...input_modality}
      test_mode: false
      force_all_boxes: true
      box_type_3d: LiDAR
      filter_empty_gt: false
      video_length: ${model.video_length}
      start_on_keyframe: ${dataset.start_on_keyframe}
      ref_length: ${model.ref_length}
      candidate_length: 5
    # Same annotation file and pipeline as val; only test_mode differs (true).
    test:
      type: ${...dataset_type}
      dataset_root: ${...dataset_root}
      ann_file: ${...dataset_process_root}nuscenes_interp_12Hz_updated_description_val.pickle
      pipeline: ${...test_pipeline}
      object_classes: ${...object_classes}
      map_classes: ${...map_classes}
      modality: ${...input_modality}
      test_mode: true
      force_all_boxes: true
      box_type_3d: LiDAR
      filter_empty_gt: false
      video_length: ${model.video_length}
      start_on_keyframe: ${dataset.start_on_keyframe}
      ref_length: ${model.ref_length}
      candidate_length: 5

  # Read by every split above through ${dataset.start_on_keyframe}.
  start_on_keyframe: true
# Accelerator settings.
accelerator:
  # Follows runner.gradient_accumulation_steps (currently 1) rather than repeating
  # the literal, so an override of the runner value cannot leave this one stale.
  # NOTE(review): assumes this section is read through OmegaConf with
  # interpolation resolved, like the other ${...} values in this file — confirm.
  gradient_accumulation_steps: ${runner.gradient_accumulation_steps}
  mixed_precision: fp16
  report_to: tensorboard
# Runner settings: loss / box perturbation options, schedule, memory and
# precision switches, optimizer, checkpointing and validation.
runner:
  foreground_loss_mode: null
  foreground_loss_weight: 0.0
  bbox_drop_ratio: 0
  bbox_add_ratio: 0
  bbox_add_num: 3
  keyframe_rate: 1

  # Schedule and data loading.
  num_train_epochs: 100
  train_batch_size: 1
  max_train_steps: null
  num_workers: 4
  prefetch_factor: 4
  display_per_epoch: 40
  display_per_n_min: 10

  # Gradient handling, memory and precision switches.
  max_grad_norm: 1.0
  set_grads_to_none: true
  enable_xformers_memory_efficient_attention: true
  unet_in_fp16: true
  enable_unet_checkpointing: false
  enable_controlnet_checkpointing: false
  noise_offset: 0.0
  train_with_same_offset: true

  # Optimizer and learning-rate schedule.
  use_8bit_adam: false
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_weight_decay: 0.01
  adam_epsilon: 1.0e-08
  learning_rate: 8.0e-05
  lr_scheduler: constant_with_warmup
  # accelerator.gradient_accumulation_steps carries the same setting.
  # NOTE(review): confirm which of the two the training code reads.
  gradient_accumulation_steps: 1
  lr_num_cycles: 1
  lr_power: 1.0
  lr_warmup_steps: 3000

  # Checkpointing and validation cadence.
  checkpointing_steps: 10000
  validation_steps: 5000
  save_model_per_epoch: 10
  validation_before_run: true
  validation_index: [138, 632, 1301, 2342]
  validation_times: 1
  validation_batch_size: 1
  validation_show_box: true
  validation_seed_global: false

  # Parameters grouped for the generation pipeline.
  pipeline_param:
    guidance_scale: 2
    num_inference_steps: 20
    eta: 0.0
    controlnet_conditioning_scale: 1.0
    guess_mode: false
    use_zero_map_as_unconditional: false
    bbox_max_length: null
    init_noise: both
    view_order: ${dataset.view_order}
    # NOTE(review): 6 here while runner.keyframe_rate above is 1 — confirm the
    # difference is intentional.
    keyframe_rate: 6