# Source: dreamforge-t / hydra / config.yaml
# Repository page header: "Jianbiao's picture" (avatar alt text)
# Commit message: init
# Commit: 0fa8de7
# Top-level run settings: identifiers, logging locations, seeding and resume.
task_id: '2.0t_0.3.3_continue'
log_root_prefix: './work_dirs/dreamforge-t-log'
# Interpolated from model.name.
projname: '${model.name}'
try_run: false
debug: false
# '???' is OmegaConf's mandatory-missing marker: reading this key fails until
# a concrete value has been assigned.
log_root: '???'
init_method: 'env://'
seed: 42
fix_seed_within_batch: false
resume_from_checkpoint: './work_dirs/dreamforge-t-log/SDv1.5mv-rawbox-t_2024-08-30_19-09_2.0t_0.3.3/checkpoint-150000/'
resume_reset_scheduler: false
validation_only: false
# Model section: dotted import paths for the runner, pipeline and network
# classes, plus one parameter mapping per component.
# Nesting is reconstructed from the relative interpolations below:
# `${..x}` resolves in the parent mapping, `${...x}` in the grandparent.
model:
  name: SDv1.5mv-rawbox-t
  pretrained_model_name_or_path: ./pretrained/stable-diffusion-v1-5/
  bbox_mode: all-xyz
  bbox_view_shared: false
  crossview_attn_type: basic
  train_with_same_noise: false
  train_with_same_t: true
  runner_module: dreamforge.runner.multiview_t_runner.MultiviewTRunner
  pipe_module: dreamforge.pipeline.pipeline_bev_controlnet_t.StableDiffusionBEVControlNetTPipeline
  unet_module: dreamforge.networks.unet_2d_condition_multiview_st.UNet2DConditionModelMultiviewSceneT
  use_fp32_for_unet_trainable: true
  unet_dir: unet
  unet:
    trainable_state: only_new
    neighboring_view_pair: ${dataset.neighboring_view_pair}
    neighboring_attn_type: add
    zero_module_type: zero_linear
    # `..` climbs to `model`, so this is model.crossview_attn_type.
    crossview_attn_type: ${..crossview_attn_type}
    img_size: ${dataset.image_size}
    # Resolves to model.video_length, defined further down in this mapping.
    video_length: ${..video_length}
    temp_pos_emb: learnable
    # Plain string "none"; YAML only treats null/Null/NULL/~ as null.
    zero_module_type2: none
    spatial_trainable: true
    with_ref: true
    # Resolves to model.ref_length.
    ref_length: ${..ref_length}
    with_can_bus: true
    with_motion: true
    transformer_type: _ff_last
  model_module: dreamforge.networks.unet_addon_rawbox.BEVControlNetModel
  controlnet_dir: controlnet
  controlnet:
    camera_in_dim: 189
    camera_out_dim: 768
    # NOTE(review): leading dimension is 4 while dataset.map_classes lists 8
    # entries and the cache tag reads 8x200x200 — confirm this is intended.
    map_size:
    - 4
    - 200
    - 200
    conditioning_embedding_out_channels:
    - 16
    - 32
    - 96
    - 256
    uncond_cam_in_dim:
    - 3
    - 7
    use_uncond_map: null
    drop_cond_ratio: 0.25
    drop_cam_num: 6
    drop_cam_with_box: false
    cam_embedder_param:
      input_dims: 3
      num_freqs: 4
      include_input: true
      log_sampling: true
    bbox_embedder_cls: dreamforge.networks.bbox_embedder.ContinuousBBoxWithTextEmbedding
    bbox_embedder_param:
      n_classes: 10
      class_token_dim: 768
      trainable_class_token: false
      use_text_encoder_init: true
      embedder_num_freq: 4
      proj_dims:
      - 768
      - 512
      - 512
      - 768
      # Three dots climb to `model`, i.e. this is model.bbox_mode.
      mode: ${...bbox_mode}
      minmax_normalize: false
    # NOTE(review): placed at controlnet level on the assumption that these
    # are ControlNet options rather than bbox-embedder options — confirm.
    with_layout_canvas: true
    canvas_conditioning_channels: 14
  load_pretrain_from: null
  allow_partial_load: false
  pretrained_dreamforge: ./pretrained/dreamforge-s
  train_with_same_noise_t: false
  # Referenced by model.unet and by dataset.data.{train,val,test}.
  video_length: 7
  ref_length: 2
  # Seven triples, matching video_length 7. NOTE(review): the grouping into
  # triples is inferred from the repeating `- - 0 / - 6 / - k` pattern — confirm.
  sc_attn_index:
  - - 0
    - 6
    - 0
  - - 0
    - 6
    - 0
  - - 0
    - 6
    - 1
  - - 0
    - 6
    - 2
  - - 0
    - 6
    - 3
  - - 0
    - 6
    - 4
  - - 0
    - 6
    - 5
  scene_embedder_cls: dreamforge.networks.scene_position_embedder.ScenePositionEmbedding
  scene_embedder_dir: scene_embedder
  scene_embedder:
    embed_dims: 320
    LID: false
  can_bus_embedder_cls: dreamforge.networks.can_bus_embedder.CanbusEmbedding
  can_bus_embedder_dir: can_bus_embedder
  can_bus_embedder:
    embed_dims: 768
    input_channels: 9
    can_bus_norm: true
  # NOTE(review): assumed to be a model-level switch, not a can_bus_embedder
  # option — confirm against the code that reads it.
  fix_controlnet: true
# Dataset section: source paths, class lists, augmentation settings, the
# train/test transform pipelines and the per-split dataset definitions.
# Nesting is reconstructed from the relative interpolations: inside a pipeline
# step or a data split, `${...x}` climbs three levels to this `dataset` mapping;
# inside the dataset_cache_file list, `${..x}` does the same.
dataset:
  dataset_type: NuScenesMapDataset
  dataset_root: ./data/nuscenes
  dataset_process_root: ./data/nuscenes_mmdet3d-12Hz_description/
  dataset_cache_file_tag: 8x200x200_12Hz_interp
  dataset_cache_dirname: nuscenes_map_aux_12Hz_interp
  # Entry 0 is consumed by train_pipeline, entry 1 by test_pipeline.
  # dataset_process_root ends with '/', so '../' steps to its parent directory.
  dataset_cache_file:
  - ${..dataset_process_root}../${..dataset_cache_dirname}/train_${..dataset_cache_file_tag}.h5
  - ${..dataset_process_root}../${..dataset_cache_dirname}/val_${..dataset_cache_file_tag}.h5
  template: A driving scene image at {location}. {description}.
  collect_meta_keys:
  - camera_intrinsics
  - lidar2ego
  - lidar2camera
  - camera2lidar
  - lidar2image
  - img_aug_matrix
  - camera2ego
  - ego2global
  collect_meta_lis_keys:
  - timeofday
  - location
  - description
  - filename
  - token
  - ori_shape
  image_size:
  - 224
  - 400
  map_bound:
    x:
    - -50.0
    - 50.0
    - 0.5
    # Quoted because a bare `y` is read as a boolean by YAML 1.1 parsers.
    'y':
    - -50.0
    - 50.0
    - 0.5
  view_order:
  - CAM_FRONT_LEFT
  - CAM_FRONT
  - CAM_FRONT_RIGHT
  - CAM_BACK_RIGHT
  - CAM_BACK
  - CAM_BACK_LEFT
  # Keys and values run 0-5; presumably indices into view_order — confirm.
  neighboring_view_pair:
    0:
    - 5
    - 1
    1:
    - 0
    - 2
    2:
    - 1
    - 3
    3:
    - 2
    - 4
    4:
    - 3
    - 5
    5:
    - 4
    - 0
  back_resize:
  - 896
  - 1600
  back_pad:
  - 0
  - 4
  - 0
  - 0
  augment2d:
    # A list holding one [0.25, 0.25] pair; the pipelines read it as resize[0].
    resize:
    - - 0.25
      - 0.25
    rotate: null
  aux_data:
  - visibility
  - center_offset
  - center_ohw
  - height
  augment3d:
    scale:
    - 1.0
    - 1.0
    rotate:
    - 0.0
    - 0.0
    translate: 0
    flip_ratio: 0.0
    flip_direction: null
  object_classes:
  - car
  - truck
  - construction_vehicle
  - bus
  - trailer
  - barrier
  - motorcycle
  - bicycle
  - pedestrian
  - traffic_cone
  map_classes:
  - drivable_area
  - ped_crossing
  - walkway
  - stop_line
  - carpark_area
  - road_divider
  - lane_divider
  - road_block
  input_modality:
    use_lidar: false
    use_camera: true
    use_radar: false
    use_map: false
    use_external: false
  # Ordered list of transform steps; each step is a mapping keyed by `type`.
  train_pipeline:
  - type: LoadMultiViewImageFromFiles
    to_float32: true
  - type: LoadAnnotations3D
    with_bbox_3d: true
    with_label_3d: true
    with_attr_label: false
  - type: ImageAug3D
    final_dim: ${...image_size}
    resize_lim: ${...augment2d.resize[0]}
    bot_pct_lim:
    - 0.0
    - 0.0
    rot_lim: ${...augment2d.rotate}
    rand_flip: false
    is_train: false
  - type: GlobalRotScaleTrans
    resize_lim: ${...augment3d.scale}
    rot_lim: ${...augment3d.rotate}
    trans_lim: ${...augment3d.translate}
    is_train: true
  - type: ObjectNameFilter
    classes: ${...object_classes}
  - type: LoadBEVSegmentation
    dataset_root: ${...dataset_root}
    xbound: ${...map_bound.x}
    ybound: ${...map_bound.y}
    classes: ${...map_classes}
    object_classes: null
    aux_data: null
    cache_file: ${...dataset_cache_file.0}
  - type: RandomFlip3DwithViews
    flip_ratio: ${...augment3d.flip_ratio}
    direction: ${...augment3d.flip_direction}
  - type: ReorderMultiViewImages
    order: ${...view_order}
    safe: false
  - type: ImageNormalize
    mean:
    - 0.5
    - 0.5
    - 0.5
    std:
    - 0.5
    - 0.5
    - 0.5
  - type: DefaultFormatBundle3D
    classes: ${...object_classes}
  - type: Collect3D
    keys:
    - img
    - gt_bboxes_3d
    - gt_labels_3d
    - gt_masks_bev
    meta_keys: ${...collect_meta_keys}
    meta_lis_keys: ${...collect_meta_lis_keys}
  # Differs from train_pipeline in three places: rot_lim is a literal
  # [0.0, 0.0], there is no RandomFlip3DwithViews step, and the BEV cache is
  # the val file (dataset_cache_file.1).
  test_pipeline:
  - type: LoadMultiViewImageFromFiles
    to_float32: true
  - type: LoadAnnotations3D
    with_bbox_3d: true
    with_label_3d: true
    with_attr_label: false
  - type: ImageAug3D
    final_dim: ${...image_size}
    resize_lim: ${...augment2d.resize[0]}
    bot_pct_lim:
    - 0.0
    - 0.0
    rot_lim:
    - 0.0
    - 0.0
    rand_flip: false
    is_train: false
  - type: GlobalRotScaleTrans
    resize_lim: ${...augment3d.scale}
    rot_lim: ${...augment3d.rotate}
    trans_lim: ${...augment3d.translate}
    # NOTE(review): is_train is true in the test pipeline as well; the
    # augment3d values above are identity ranges — confirm this is intended.
    is_train: true
  - type: ObjectNameFilter
    classes: ${...object_classes}
  - type: LoadBEVSegmentation
    dataset_root: ${...dataset_root}
    xbound: ${...map_bound.x}
    ybound: ${...map_bound.y}
    classes: ${...map_classes}
    object_classes: null
    aux_data: null
    cache_file: ${...dataset_cache_file.1}
  - type: ReorderMultiViewImages
    order: ${...view_order}
    safe: false
  - type: ImageNormalize
    mean:
    - 0.5
    - 0.5
    - 0.5
    std:
    - 0.5
    - 0.5
    - 0.5
  - type: DefaultFormatBundle3D
    classes: ${...object_classes}
  - type: Collect3D
    keys:
    - img
    - gt_bboxes_3d
    - gt_labels_3d
    - gt_masks_bev
    meta_keys: ${...collect_meta_keys}
    meta_lis_keys: ${...collect_meta_lis_keys}
  # Per-split dataset definitions. val and test share the same annotation file
  # and pipeline; only test sets test_mode to true.
  data:
    train:
      type: ${...dataset_type}
      dataset_root: ${...dataset_root}
      ann_file: ${...dataset_process_root}nuscenes_interp_12Hz_updated_description_train.pickle
      pipeline: ${...train_pipeline}
      object_classes: ${...object_classes}
      map_classes: ${...map_classes}
      modality: ${...input_modality}
      test_mode: false
      force_all_boxes: true
      box_type_3d: LiDAR
      filter_empty_gt: false
      video_length: ${model.video_length}
      start_on_keyframe: ${dataset.start_on_keyframe}
      ref_length: ${model.ref_length}
      candidate_length: 5
    val:
      type: ${...dataset_type}
      dataset_root: ${...dataset_root}
      ann_file: ${...dataset_process_root}nuscenes_interp_12Hz_updated_description_val.pickle
      pipeline: ${...test_pipeline}
      object_classes: ${...object_classes}
      map_classes: ${...map_classes}
      modality: ${...input_modality}
      test_mode: false
      force_all_boxes: true
      box_type_3d: LiDAR
      filter_empty_gt: false
      video_length: ${model.video_length}
      start_on_keyframe: ${dataset.start_on_keyframe}
      ref_length: ${model.ref_length}
      candidate_length: 5
    test:
      type: ${...dataset_type}
      dataset_root: ${...dataset_root}
      ann_file: ${...dataset_process_root}nuscenes_interp_12Hz_updated_description_val.pickle
      pipeline: ${...test_pipeline}
      object_classes: ${...object_classes}
      map_classes: ${...map_classes}
      modality: ${...input_modality}
      test_mode: true
      force_all_boxes: true
      box_type_3d: LiDAR
      filter_empty_gt: false
      video_length: ${model.video_length}
      start_on_keyframe: ${dataset.start_on_keyframe}
      ref_length: ${model.ref_length}
      candidate_length: 5
  # Dataset-level key; every split above reads it via ${dataset.start_on_keyframe}.
  start_on_keyframe: true
# Accelerator section. Its keys are nested under `accelerator` so that
# gradient_accumulation_steps does not collide with the key of the same name
# in the `runner` section.
accelerator:
  gradient_accumulation_steps: 1
  mixed_precision: fp16
  report_to: tensorboard
# Runner section: loss and box-perturbation options, training-loop and
# optimizer hyper-parameters, checkpoint/validation cadence, and the
# parameters grouped under pipeline_param.
runner:
  foreground_loss_mode: null
  foreground_loss_weight: 0.0
  bbox_drop_ratio: 0
  bbox_add_ratio: 0
  bbox_add_num: 3
  keyframe_rate: 1
  num_train_epochs: 100
  train_batch_size: 1
  max_train_steps: null
  num_workers: 4
  prefetch_factor: 4
  display_per_epoch: 40
  display_per_n_min: 10
  max_grad_norm: 1.0
  set_grads_to_none: true
  enable_xformers_memory_efficient_attention: true
  unet_in_fp16: true
  enable_unet_checkpointing: false
  enable_controlnet_checkpointing: false
  noise_offset: 0.0
  train_with_same_offset: true
  use_8bit_adam: false
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_weight_decay: 0.01
  adam_epsilon: 1.0e-08
  learning_rate: 8.0e-05
  lr_scheduler: constant_with_warmup
  # Same value as accelerator.gradient_accumulation_steps; keep the two in sync.
  gradient_accumulation_steps: 1
  lr_num_cycles: 1
  lr_power: 1.0
  lr_warmup_steps: 3000
  checkpointing_steps: 10000
  validation_steps: 5000
  save_model_per_epoch: 10
  validation_before_run: true
  validation_index:
  - 138
  - 632
  - 1301
  - 2342
  validation_times: 1
  validation_batch_size: 1
  validation_show_box: true
  validation_seed_global: false
  pipeline_param:
    guidance_scale: 2
    num_inference_steps: 20
    eta: 0.0
    controlnet_conditioning_scale: 1.0
    guess_mode: false
    use_zero_map_as_unconditional: false
    bbox_max_length: null
    init_noise: both
    view_order: ${dataset.view_order}
    # NOTE(review): runner already defines keyframe_rate (1) above, so this
    # second key cannot be a sibling of it. It is placed in pipeline_param;
    # the alternative is a top-level key — confirm.
    keyframe_rate: 6