Jianbiao committed on
Commit
0fa8de7
·
1 Parent(s): ec6f97a
can_bus_embedder/can_bus_embedder_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f78509a6b1dba2bbf3c094d2705c9ddcb2bcaff1ecf9ef0c4f1d1a78afa7678
3
+ size 1206295
controlnet/config.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "BEVControlNetModel",
3
+ "_diffusers_version": "0.17.1",
4
+ "act_fn": "silu",
5
+ "attention_head_dim": 8,
6
+ "bbox_embedder_cls": "dreamforge.networks.bbox_embedder.ContinuousBBoxWithTextEmbedding",
7
+ "bbox_embedder_param": {
8
+ "class_token_dim": 768,
9
+ "embedder_num_freq": 4,
10
+ "minmax_normalize": false,
11
+ "mode": "all-xyz",
12
+ "n_classes": 10,
13
+ "proj_dims": [
14
+ 768,
15
+ 512,
16
+ 512,
17
+ 768
18
+ ],
19
+ "trainable_class_token": false,
20
+ "use_text_encoder_init": true
21
+ },
22
+ "block_out_channels": [
23
+ 320,
24
+ 640,
25
+ 1280,
26
+ 1280
27
+ ],
28
+ "cam_embedder_param": {
29
+ "include_input": true,
30
+ "input_dims": 3,
31
+ "log_sampling": true,
32
+ "num_freqs": 4
33
+ },
34
+ "camera_in_dim": 189,
35
+ "camera_out_dim": 768,
36
+ "canvas_conditioning_channels": 14,
37
+ "class_embed_type": null,
38
+ "conditioning_embedding_out_channels": [
39
+ 16,
40
+ 32,
41
+ 96,
42
+ 256
43
+ ],
44
+ "controlnet_conditioning_channel_order": "rgb",
45
+ "cross_attention_dim": 768,
46
+ "down_block_types": [
47
+ "CrossAttnDownBlock2D",
48
+ "CrossAttnDownBlock2D",
49
+ "CrossAttnDownBlock2D",
50
+ "DownBlock2D"
51
+ ],
52
+ "downsample_padding": 1,
53
+ "drop_cam_num": 6,
54
+ "drop_cam_with_box": false,
55
+ "drop_cond_ratio": 0.25,
56
+ "flip_sin_to_cos": true,
57
+ "freq_shift": 0,
58
+ "global_pool_conditions": false,
59
+ "in_channels": 4,
60
+ "layers_per_block": 2,
61
+ "map_embedder_cls": null,
62
+ "map_embedder_param": null,
63
+ "map_size": [
64
+ 4,
65
+ 200,
66
+ 200
67
+ ],
68
+ "mid_block_scale_factor": 1,
69
+ "norm_eps": 1e-05,
70
+ "norm_num_groups": 32,
71
+ "num_class_embeds": null,
72
+ "only_cross_attention": false,
73
+ "projection_class_embeddings_input_dim": null,
74
+ "resnet_time_scale_shift": "default",
75
+ "uncond_cam_in_dim": [
76
+ 3,
77
+ 7
78
+ ],
79
+ "upcast_attention": false,
80
+ "use_linear_projection": false,
81
+ "use_uncond_map": null,
82
+ "with_layout_canvas": true
83
+ }
controlnet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34524c766706dcac5a2316f8cd2b3b89dd85739b8823be3c5b19144f6223c4e3
3
+ size 1456676573
hydra/config.yaml ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task_id: 2.0t_0.3.3_continue
2
+ log_root_prefix: ./work_dirs/dreamforge-t-log
3
+ projname: ${model.name}
4
+ try_run: false
5
+ debug: false
6
+ log_root: ???
7
+ init_method: env://
8
+ seed: 42
9
+ fix_seed_within_batch: false
10
+ resume_from_checkpoint: ./work_dirs/dreamforge-t-log/SDv1.5mv-rawbox-t_2024-08-30_19-09_2.0t_0.3.3/checkpoint-150000/
11
+ resume_reset_scheduler: false
12
+ validation_only: false
13
+ model:
14
+ name: SDv1.5mv-rawbox-t
15
+ pretrained_model_name_or_path: ./pretrained/stable-diffusion-v1-5/
16
+ bbox_mode: all-xyz
17
+ bbox_view_shared: false
18
+ crossview_attn_type: basic
19
+ train_with_same_noise: false
20
+ train_with_same_t: true
21
+ runner_module: dreamforge.runner.multiview_t_runner.MultiviewTRunner
22
+ pipe_module: dreamforge.pipeline.pipeline_bev_controlnet_t.StableDiffusionBEVControlNetTPipeline
23
+ unet_module: dreamforge.networks.unet_2d_condition_multiview_st.UNet2DConditionModelMultiviewSceneT
24
+ use_fp32_for_unet_trainable: true
25
+ unet_dir: unet
26
+ unet:
27
+ trainable_state: only_new
28
+ neighboring_view_pair: ${dataset.neighboring_view_pair}
29
+ neighboring_attn_type: add
30
+ zero_module_type: zero_linear
31
+ crossview_attn_type: ${..crossview_attn_type}
32
+ img_size: ${dataset.image_size}
33
+ video_length: ${..video_length}
34
+ temp_pos_emb: learnable
35
+ zero_module_type2: none
36
+ spatial_trainable: true
37
+ with_ref: true
38
+ ref_length: ${..ref_length}
39
+ with_can_bus: true
40
+ with_motion: true
41
+ transformer_type: _ff_last
42
+ model_module: dreamforge.networks.unet_addon_rawbox.BEVControlNetModel
43
+ controlnet_dir: controlnet
44
+ controlnet:
45
+ camera_in_dim: 189
46
+ camera_out_dim: 768
47
+ map_size:
48
+ - 4
49
+ - 200
50
+ - 200
51
+ conditioning_embedding_out_channels:
52
+ - 16
53
+ - 32
54
+ - 96
55
+ - 256
56
+ uncond_cam_in_dim:
57
+ - 3
58
+ - 7
59
+ use_uncond_map: null
60
+ drop_cond_ratio: 0.25
61
+ drop_cam_num: 6
62
+ drop_cam_with_box: false
63
+ cam_embedder_param:
64
+ input_dims: 3
65
+ num_freqs: 4
66
+ include_input: true
67
+ log_sampling: true
68
+ bbox_embedder_cls: dreamforge.networks.bbox_embedder.ContinuousBBoxWithTextEmbedding
69
+ bbox_embedder_param:
70
+ n_classes: 10
71
+ class_token_dim: 768
72
+ trainable_class_token: false
73
+ use_text_encoder_init: true
74
+ embedder_num_freq: 4
75
+ proj_dims:
76
+ - 768
77
+ - 512
78
+ - 512
79
+ - 768
80
+ mode: ${...bbox_mode}
81
+ minmax_normalize: false
82
+ with_layout_canvas: true
83
+ canvas_conditioning_channels: 14
84
+ load_pretrain_from: null
85
+ allow_partial_load: false
86
+ pretrained_dreamforge: ./pretrained/dreamforge-s
87
+ train_with_same_noise_t: false
88
+ video_length: 7
89
+ ref_length: 2
90
+ sc_attn_index:
91
+ - - 0
92
+ - 6
93
+ - 0
94
+ - - 0
95
+ - 6
96
+ - 0
97
+ - - 0
98
+ - 6
99
+ - 1
100
+ - - 0
101
+ - 6
102
+ - 2
103
+ - - 0
104
+ - 6
105
+ - 3
106
+ - - 0
107
+ - 6
108
+ - 4
109
+ - - 0
110
+ - 6
111
+ - 5
112
+ scene_embedder_cls: dreamforge.networks.scene_position_embedder.ScenePositionEmbedding
113
+ scene_embedder_dir: scene_embedder
114
+ scene_embedder:
115
+ embed_dims: 320
116
+ LID: false
117
+ can_bus_embedder_cls: dreamforge.networks.can_bus_embedder.CanbusEmbedding
118
+ can_bus_embedder_dir: can_bus_embedder
119
+ can_bus_embedder:
120
+ embed_dims: 768
121
+ input_channels: 9
122
+ can_bus_norm: true
123
+ fix_controlnet: true
124
+ dataset:
125
+ dataset_type: NuScenesMapDataset
126
+ dataset_root: ./data/nuscenes
127
+ dataset_process_root: ./data/nuscenes_mmdet3d-12Hz_description/
128
+ dataset_cache_file_tag: 8x200x200_12Hz_interp
129
+ dataset_cache_dirname: nuscenes_map_aux_12Hz_interp
130
+ dataset_cache_file:
131
+ - ${..dataset_process_root}../${..dataset_cache_dirname}/train_${..dataset_cache_file_tag}.h5
132
+ - ${..dataset_process_root}../${..dataset_cache_dirname}/val_${..dataset_cache_file_tag}.h5
133
+ template: A driving scene image at {location}. {description}.
134
+ collect_meta_keys:
135
+ - camera_intrinsics
136
+ - lidar2ego
137
+ - lidar2camera
138
+ - camera2lidar
139
+ - lidar2image
140
+ - img_aug_matrix
141
+ - camera2ego
142
+ - ego2global
143
+ collect_meta_lis_keys:
144
+ - timeofday
145
+ - location
146
+ - description
147
+ - filename
148
+ - token
149
+ - ori_shape
150
+ image_size:
151
+ - 224
152
+ - 400
153
+ map_bound:
154
+ x:
155
+ - -50.0
156
+ - 50.0
157
+ - 0.5
158
+ 'y':
159
+ - -50.0
160
+ - 50.0
161
+ - 0.5
162
+ view_order:
163
+ - CAM_FRONT_LEFT
164
+ - CAM_FRONT
165
+ - CAM_FRONT_RIGHT
166
+ - CAM_BACK_RIGHT
167
+ - CAM_BACK
168
+ - CAM_BACK_LEFT
169
+ neighboring_view_pair:
170
+ 0:
171
+ - 5
172
+ - 1
173
+ 1:
174
+ - 0
175
+ - 2
176
+ 2:
177
+ - 1
178
+ - 3
179
+ 3:
180
+ - 2
181
+ - 4
182
+ 4:
183
+ - 3
184
+ - 5
185
+ 5:
186
+ - 4
187
+ - 0
188
+ back_resize:
189
+ - 896
190
+ - 1600
191
+ back_pad:
192
+ - 0
193
+ - 4
194
+ - 0
195
+ - 0
196
+ augment2d:
197
+ resize:
198
+ - - 0.25
199
+ - 0.25
200
+ rotate: null
201
+ aux_data:
202
+ - visibility
203
+ - center_offset
204
+ - center_ohw
205
+ - height
206
+ augment3d:
207
+ scale:
208
+ - 1.0
209
+ - 1.0
210
+ rotate:
211
+ - 0.0
212
+ - 0.0
213
+ translate: 0
214
+ flip_ratio: 0.0
215
+ flip_direction: null
216
+ object_classes:
217
+ - car
218
+ - truck
219
+ - construction_vehicle
220
+ - bus
221
+ - trailer
222
+ - barrier
223
+ - motorcycle
224
+ - bicycle
225
+ - pedestrian
226
+ - traffic_cone
227
+ map_classes:
228
+ - drivable_area
229
+ - ped_crossing
230
+ - walkway
231
+ - stop_line
232
+ - carpark_area
233
+ - road_divider
234
+ - lane_divider
235
+ - road_block
236
+ input_modality:
237
+ use_lidar: false
238
+ use_camera: true
239
+ use_radar: false
240
+ use_map: false
241
+ use_external: false
242
+ train_pipeline:
243
+ - type: LoadMultiViewImageFromFiles
244
+ to_float32: true
245
+ - type: LoadAnnotations3D
246
+ with_bbox_3d: true
247
+ with_label_3d: true
248
+ with_attr_label: false
249
+ - type: ImageAug3D
250
+ final_dim: ${...image_size}
251
+ resize_lim: ${...augment2d.resize[0]}
252
+ bot_pct_lim:
253
+ - 0.0
254
+ - 0.0
255
+ rot_lim: ${...augment2d.rotate}
256
+ rand_flip: false
257
+ is_train: false
258
+ - type: GlobalRotScaleTrans
259
+ resize_lim: ${...augment3d.scale}
260
+ rot_lim: ${...augment3d.rotate}
261
+ trans_lim: ${...augment3d.translate}
262
+ is_train: true
263
+ - type: ObjectNameFilter
264
+ classes: ${...object_classes}
265
+ - type: LoadBEVSegmentation
266
+ dataset_root: ${...dataset_root}
267
+ xbound: ${...map_bound.x}
268
+ ybound: ${...map_bound.y}
269
+ classes: ${...map_classes}
270
+ object_classes: null
271
+ aux_data: null
272
+ cache_file: ${...dataset_cache_file.0}
273
+ - type: RandomFlip3DwithViews
274
+ flip_ratio: ${...augment3d.flip_ratio}
275
+ direction: ${...augment3d.flip_direction}
276
+ - type: ReorderMultiViewImages
277
+ order: ${...view_order}
278
+ safe: false
279
+ - type: ImageNormalize
280
+ mean:
281
+ - 0.5
282
+ - 0.5
283
+ - 0.5
284
+ std:
285
+ - 0.5
286
+ - 0.5
287
+ - 0.5
288
+ - type: DefaultFormatBundle3D
289
+ classes: ${...object_classes}
290
+ - type: Collect3D
291
+ keys:
292
+ - img
293
+ - gt_bboxes_3d
294
+ - gt_labels_3d
295
+ - gt_masks_bev
296
+ meta_keys: ${...collect_meta_keys}
297
+ meta_lis_keys: ${...collect_meta_lis_keys}
298
+ test_pipeline:
299
+ - type: LoadMultiViewImageFromFiles
300
+ to_float32: true
301
+ - type: LoadAnnotations3D
302
+ with_bbox_3d: true
303
+ with_label_3d: true
304
+ with_attr_label: false
305
+ - type: ImageAug3D
306
+ final_dim: ${...image_size}
307
+ resize_lim: ${...augment2d.resize[0]}
308
+ bot_pct_lim:
309
+ - 0.0
310
+ - 0.0
311
+ rot_lim:
312
+ - 0.0
313
+ - 0.0
314
+ rand_flip: false
315
+ is_train: false
316
+ - type: GlobalRotScaleTrans
317
+ resize_lim: ${...augment3d.scale}
318
+ rot_lim: ${...augment3d.rotate}
319
+ trans_lim: ${...augment3d.translate}
320
+ is_train: true
321
+ - type: ObjectNameFilter
322
+ classes: ${...object_classes}
323
+ - type: LoadBEVSegmentation
324
+ dataset_root: ${...dataset_root}
325
+ xbound: ${...map_bound.x}
326
+ ybound: ${...map_bound.y}
327
+ classes: ${...map_classes}
328
+ object_classes: null
329
+ aux_data: null
330
+ cache_file: ${...dataset_cache_file.1}
331
+ - type: ReorderMultiViewImages
332
+ order: ${...view_order}
333
+ safe: false
334
+ - type: ImageNormalize
335
+ mean:
336
+ - 0.5
337
+ - 0.5
338
+ - 0.5
339
+ std:
340
+ - 0.5
341
+ - 0.5
342
+ - 0.5
343
+ - type: DefaultFormatBundle3D
344
+ classes: ${...object_classes}
345
+ - type: Collect3D
346
+ keys:
347
+ - img
348
+ - gt_bboxes_3d
349
+ - gt_labels_3d
350
+ - gt_masks_bev
351
+ meta_keys: ${...collect_meta_keys}
352
+ meta_lis_keys: ${...collect_meta_lis_keys}
353
+ data:
354
+ train:
355
+ type: ${...dataset_type}
356
+ dataset_root: ${...dataset_root}
357
+ ann_file: ${...dataset_process_root}nuscenes_interp_12Hz_updated_description_train.pickle
358
+ pipeline: ${...train_pipeline}
359
+ object_classes: ${...object_classes}
360
+ map_classes: ${...map_classes}
361
+ modality: ${...input_modality}
362
+ test_mode: false
363
+ force_all_boxes: true
364
+ box_type_3d: LiDAR
365
+ filter_empty_gt: false
366
+ video_length: ${model.video_length}
367
+ start_on_keyframe: ${dataset.start_on_keyframe}
368
+ ref_length: ${model.ref_length}
369
+ candidate_length: 5
370
+ val:
371
+ type: ${...dataset_type}
372
+ dataset_root: ${...dataset_root}
373
+ ann_file: ${...dataset_process_root}nuscenes_interp_12Hz_updated_description_val.pickle
374
+ pipeline: ${...test_pipeline}
375
+ object_classes: ${...object_classes}
376
+ map_classes: ${...map_classes}
377
+ modality: ${...input_modality}
378
+ test_mode: false
379
+ force_all_boxes: true
380
+ box_type_3d: LiDAR
381
+ filter_empty_gt: false
382
+ video_length: ${model.video_length}
383
+ start_on_keyframe: ${dataset.start_on_keyframe}
384
+ ref_length: ${model.ref_length}
385
+ candidate_length: 5
386
+ test:
387
+ type: ${...dataset_type}
388
+ dataset_root: ${...dataset_root}
389
+ ann_file: ${...dataset_process_root}nuscenes_interp_12Hz_updated_description_val.pickle
390
+ pipeline: ${...test_pipeline}
391
+ object_classes: ${...object_classes}
392
+ map_classes: ${...map_classes}
393
+ modality: ${...input_modality}
394
+ test_mode: true
395
+ force_all_boxes: true
396
+ box_type_3d: LiDAR
397
+ filter_empty_gt: false
398
+ video_length: ${model.video_length}
399
+ start_on_keyframe: ${dataset.start_on_keyframe}
400
+ ref_length: ${model.ref_length}
401
+ candidate_length: 5
402
+ start_on_keyframe: true
403
+ accelerator:
404
+ gradient_accumulation_steps: 1
405
+ mixed_precision: fp16
406
+ report_to: tensorboard
407
+ runner:
408
+ foreground_loss_mode: null
409
+ foreground_loss_weight: 0.0
410
+ bbox_drop_ratio: 0
411
+ bbox_add_ratio: 0
412
+ bbox_add_num: 3
413
+ keyframe_rate: 1
414
+ num_train_epochs: 100
415
+ train_batch_size: 1
416
+ max_train_steps: null
417
+ num_workers: 4
418
+ prefetch_factor: 4
419
+ display_per_epoch: 40
420
+ display_per_n_min: 10
421
+ max_grad_norm: 1.0
422
+ set_grads_to_none: true
423
+ enable_xformers_memory_efficient_attention: true
424
+ unet_in_fp16: true
425
+ enable_unet_checkpointing: false
426
+ enable_controlnet_checkpointing: false
427
+ noise_offset: 0.0
428
+ train_with_same_offset: true
429
+ use_8bit_adam: false
430
+ adam_beta1: 0.9
431
+ adam_beta2: 0.999
432
+ adam_weight_decay: 0.01
433
+ adam_epsilon: 1.0e-08
434
+ learning_rate: 8.0e-05
435
+ lr_scheduler: constant_with_warmup
436
+ gradient_accumulation_steps: 1
437
+ lr_num_cycles: 1
438
+ lr_power: 1.0
439
+ lr_warmup_steps: 3000
440
+ checkpointing_steps: 10000
441
+ validation_steps: 5000
442
+ save_model_per_epoch: 10
443
+ validation_before_run: true
444
+ validation_index:
445
+ - 138
446
+ - 632
447
+ - 1301
448
+ - 2342
449
+ validation_times: 1
450
+ validation_batch_size: 1
451
+ validation_show_box: true
452
+ validation_seed_global: false
453
+ pipeline_param:
454
+ guidance_scale: 2
455
+ num_inference_steps: 20
456
+ eta: 0.0
457
+ controlnet_conditioning_scale: 1.0
458
+ guess_mode: false
459
+ use_zero_map_as_unconditional: false
460
+ bbox_max_length: null
461
+ init_noise: both
462
+ view_order: ${dataset.view_order}
463
+ keyframe_rate: 6
hydra/hydra.yaml ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${log_root_prefix}/${projname}_${now:%Y-%m-%d}_${now:%H-%M}_${task_id}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - runner=8gpus_t
116
+ - +exp=dreamforge_t
117
+ - +resume_from_checkpoint=./work_dirs/dreamforge-t-log/SDv1.5mv-rawbox-t_2024-08-30_19-09_2.0t_0.3.3/checkpoint-150000/
118
+ - task_id=2.0t_0.3.3_continue
119
+ job:
120
+ name: train
121
+ chdir: null
122
+ override_dirname: +exp=dreamforge_t,+resume_from_checkpoint=./work_dirs/dreamforge-t-log/SDv1.5mv-rawbox-t_2024-08-30_19-09_2.0t_0.3.3/checkpoint-150000/,runner=8gpus_t,task_id=2.0t_0.3.3_continue
123
+ id: ???
124
+ num: ???
125
+ config_name: config_single
126
+ env_set: {}
127
+ env_copy: []
128
+ config:
129
+ override_dirname:
130
+ kv_sep: '='
131
+ item_sep: ','
132
+ exclude_keys: []
133
+ runtime:
134
+ version: 1.3.0
135
+ version_base: '1.3'
136
+ cwd: /path/to/DreamForge
137
+ config_sources:
138
+ - path: hydra.conf
139
+ schema: pkg
140
+ provider: hydra
141
+ - path: /path/to/DreamForge/configs
142
+ schema: file
143
+ provider: main
144
+ - path: ''
145
+ schema: structured
146
+ provider: schema
147
+ output_dir: /path/to/DreamForge/work_dirs/dreamforge-t-log/SDv1.5mv-rawbox-t_2024-09-03_10-42_2.0t_0.3.3_continue
148
+ choices:
149
+ exp: dreamforge_t
150
+ exp/model@model: ../../model/SDv1.5mv_rawbox_t
151
+ runner: 8gpus_t
152
+ accelerator: default
153
+ dataset: Nuscenes_cache
154
+ model: SDv1.5mv_rawbox
155
+ hydra/env: default
156
+ hydra/callbacks: null
157
+ hydra/job_logging: default
158
+ hydra/hydra_logging: default
159
+ hydra/hydra_help: default
160
+ hydra/help: default
161
+ hydra/sweeper: basic
162
+ hydra/launcher: basic
163
+ hydra/output: default
164
+ verbose: false
hydra/overrides.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ - runner=8gpus_t
2
+ - +exp=dreamforge_t
3
+ - +resume_from_checkpoint=./work_dirs/dreamforge-t-log/SDv1.5mv-rawbox-t_2024-08-30_19-09_2.0t_0.3.3/checkpoint-150000/
4
+ - task_id=2.0t_0.3.3_continue
scene_embedder/scene_embedder_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:293016fa036b7d36c8c71e588c4946be12acdeef1bc772d8f309992b6c82684a
3
+ size 9184513
unet/config.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNet2DConditionModelMultiviewSceneT",
3
+ "_diffusers_version": "0.17.1",
4
+ "_name_or_path": "./pretrained/dreamforge-s",
5
+ "act_fn": "silu",
6
+ "addition_embed_type": null,
7
+ "addition_embed_type_num_heads": 64,
8
+ "attention_head_dim": 8,
9
+ "attn1_q_trainable": true,
10
+ "block_out_channels": [
11
+ 320,
12
+ 640,
13
+ 1280,
14
+ 1280
15
+ ],
16
+ "center_input_sample": false,
17
+ "class_embed_type": null,
18
+ "class_embeddings_concat": false,
19
+ "conv_in_kernel": 3,
20
+ "conv_out_kernel": 3,
21
+ "cross_attention_dim": 768,
22
+ "cross_attention_norm": null,
23
+ "crossview_attn_type": "basic",
24
+ "down_block_types": [
25
+ "CrossAttnDownBlock2D",
26
+ "CrossAttnDownBlock2D",
27
+ "CrossAttnDownBlock2D",
28
+ "DownBlock2D"
29
+ ],
30
+ "downsample_padding": 1,
31
+ "dual_cross_attention": false,
32
+ "encoder_hid_dim": null,
33
+ "encoder_hid_dim_type": null,
34
+ "epipolar_mask_type": "binary",
35
+ "flip_sin_to_cos": true,
36
+ "freq_shift": 0,
37
+ "img_size": [
38
+ 224,
39
+ 400
40
+ ],
41
+ "in_channels": 4,
42
+ "layers_per_block": 2,
43
+ "mid_block_only_cross_attention": null,
44
+ "mid_block_scale_factor": 1,
45
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
46
+ "neighboring_attn_type": "add",
47
+ "neighboring_view_pair": {
48
+ "0": [
49
+ 5,
50
+ 1
51
+ ],
52
+ "1": [
53
+ 0,
54
+ 2
55
+ ],
56
+ "2": [
57
+ 1,
58
+ 3
59
+ ],
60
+ "3": [
61
+ 2,
62
+ 4
63
+ ],
64
+ "4": [
65
+ 3,
66
+ 5
67
+ ],
68
+ "5": [
69
+ 4,
70
+ 0
71
+ ]
72
+ },
73
+ "norm_eps": 1e-05,
74
+ "norm_num_groups": 32,
75
+ "num_class_embeds": null,
76
+ "only_cross_attention": false,
77
+ "out_channels": 4,
78
+ "projection_class_embeddings_input_dim": null,
79
+ "ref_length": 2,
80
+ "resnet_out_scale_factor": 1.0,
81
+ "resnet_skip_time_act": false,
82
+ "resnet_time_scale_shift": "default",
83
+ "sample_size": 64,
84
+ "scene_channels": 320,
85
+ "spatial_trainable": true,
86
+ "temp_pos_emb": "learnable",
87
+ "time_cond_proj_dim": null,
88
+ "time_embedding_act_fn": null,
89
+ "time_embedding_dim": null,
90
+ "time_embedding_type": "positional",
91
+ "timestep_post_act": null,
92
+ "trainable_state": "only_new",
93
+ "transformer_type": "_ff_last",
94
+ "up_block_types": [
95
+ "UpBlock2D",
96
+ "CrossAttnUpBlock2D",
97
+ "CrossAttnUpBlock2D",
98
+ "CrossAttnUpBlock2D"
99
+ ],
100
+ "upcast_attention": false,
101
+ "use_linear_projection": false,
102
+ "video_length": 7,
103
+ "with_can_bus": true,
104
+ "with_motion": true,
105
+ "with_ref": true,
106
+ "zero_module_type": "zero_linear",
107
+ "zero_module_type2": "none"
108
+ }
unet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6e0a0d85014e8680666672102e71e030539138e5f4aacf2aae22568ccb1a577
3
+ size 2770299339