ameerazam08 committed (verified)
Commit 210e8a2 · Parent(s): 12e7a68

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. .gitignore +22 -0
  2. .gitmodules +6 -0
  3. README.md +153 -0
  4. arguments/__init__.py +118 -0
  5. assets/main.png +0 -0
  6. data/.gitkeep +0 -0
  7. data_utils/deepspeech_features/README.md +20 -0
  8. data_utils/deepspeech_features/deepspeech_features.py +274 -0
  9. data_utils/deepspeech_features/deepspeech_store.py +172 -0
  10. data_utils/deepspeech_features/extract_ds_features.py +130 -0
  11. data_utils/deepspeech_features/extract_wav.py +87 -0
  12. data_utils/deepspeech_features/fea_win.py +11 -0
  13. data_utils/easyportrait/create_teeth_mask.py +34 -0
  14. data_utils/easyportrait/local_configs/__base__/datasets/easyportrait_1024x1024.py +59 -0
  15. data_utils/easyportrait/local_configs/__base__/datasets/easyportrait_384x384.py +59 -0
  16. data_utils/easyportrait/local_configs/__base__/datasets/easyportrait_512x512.py +59 -0
  17. data_utils/easyportrait/local_configs/__base__/default_runtime.py +14 -0
  18. data_utils/easyportrait/local_configs/__base__/models/bisenetv2.py +80 -0
  19. data_utils/easyportrait/local_configs/__base__/models/fcn_resnet50.py +45 -0
  20. data_utils/easyportrait/local_configs/__base__/models/fpn_resnet50.py +36 -0
  21. data_utils/easyportrait/local_configs/__base__/models/lraspp.py +25 -0
  22. data_utils/easyportrait/local_configs/__base__/models/segformer.py +34 -0
  23. data_utils/easyportrait/local_configs/__base__/schedules/schedule_10k_adamw.py +11 -0
  24. data_utils/easyportrait/local_configs/__base__/schedules/schedule_160k_adamw.py +9 -0
  25. data_utils/easyportrait/local_configs/__base__/schedules/schedule_20k_adamw.py +11 -0
  26. data_utils/easyportrait/local_configs/__base__/schedules/schedule_40k_adamw.py +9 -0
  27. data_utils/easyportrait/local_configs/__base__/schedules/schedule_80k_adamw.py +9 -0
  28. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/bisenet-fp/bisenetv2-fp.py +221 -0
  29. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/bisenet-ps/bisenetv2-ps.py +218 -0
  30. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/danet-fp/danet-fp.py +174 -0
  31. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/danet-ps/danet-ps.py +171 -0
  32. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/deeplab-fp/deeplabv3-fp.py +174 -0
  33. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/deeplab-ps/deeplabv3-ps.py +171 -0
  34. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/fastscnn-fp/fastscnn-fp.py +165 -0
  35. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/fastscnn-ps/fastscnn-ps.py +162 -0
  36. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/fcn-fp/fcn-fp.py +187 -0
  37. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/fcn-ps/fcn-ps.py +184 -0
  38. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/fpn-fp/fpn-fp.py +182 -0
  39. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/fpn-ps/fpn-ps.py +179 -0
  40. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/segformer-fp/segformer-fp.py +182 -0
  41. data_utils/easyportrait/local_configs/easyportrait_experiments_v2/segformer-ps/segformer-ps.py +179 -0
  42. data_utils/easyportrait/mmseg/.mim/configs +0 -0
  43. data_utils/easyportrait/mmseg/.mim/tools +0 -0
  44. data_utils/easyportrait/mmseg/__init__.py +62 -0
  45. data_utils/easyportrait/mmseg/apis/__init__.py +11 -0
  46. data_utils/easyportrait/mmseg/apis/inference.py +145 -0
  47. data_utils/easyportrait/mmseg/apis/test.py +233 -0
  48. data_utils/easyportrait/mmseg/apis/train.py +194 -0
  49. data_utils/easyportrait/mmseg/core/__init__.py +12 -0
  50. data_utils/easyportrait/mmseg/core/builder.py +33 -0
.gitignore ADDED
@@ -0,0 +1,22 @@
+ __pycache__/
+ build/
+ *.egg-info/
+ *.so
+ *.mp4
+ *.pth
+
+ data_utils/face_tracking/3DMM/*
+ data_utils/face_parsing/79999_iter.pth
+
+ *.pyc
+ .vscode
+ output*
+ build
+ gridencoder/gridencoder.egg-info
+ diff_rasterization/diff_rast.egg-info
+ diff_rasterization/dist
+ tensorboard_3d
+ screenshots
+
+ data/*
+ !*.gitkeep
.gitmodules ADDED
@@ -0,0 +1,6 @@
+ [submodule "submodules/simple-knn"]
+ 	path = submodules/simple-knn
+ 	url = https://gitlab.inria.fr/bkerbl/simple-knn.git
+ [submodule "submodules/diff-gaussian-rasterization"]
+ 	path = submodules/diff-gaussian-rasterization
+ 	url = https://github.com/ashawkey/diff-gaussian-rasterization.git
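
Note: these submodules provide the CUDA extensions the project builds against. If the repository was cloned without `--recursive`, they can be fetched afterwards with the standard git command:

```bash
git submodule update --init --recursive
```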
README.md ADDED
@@ -0,0 +1,153 @@
+ # TalkingGaussian: Structure-Persistent 3D Talking Head Synthesis via Gaussian Splatting
+
+ This is the official repository for our ECCV 2024 paper **TalkingGaussian: Structure-Persistent 3D Talking Head Synthesis via Gaussian Splatting**.
+
+ [Paper](https://arxiv.org/abs/2404.15264) | [Project](https://fictionarry.github.io/TalkingGaussian/) | [Video](https://youtu.be/c5VG7HkDs8I)
+
+ ![image](./assets/main.png)
+
+
+ ## Installation
+
+ Tested on Ubuntu 18.04, CUDA 11.3, PyTorch 1.12.1.
+
+ ```
+ git clone git@github.com:Fictionarry/TalkingGaussian.git --recursive
+
+ conda env create --file environment.yml
+ conda activate talking_gaussian
+ pip install "git+https://github.com/facebookresearch/pytorch3d.git"
+ pip install tensorflow-gpu==2.8.0
+ ```
+
+ If you encounter installation problems with `diff-gaussian-rasterization` or `gridencoder`, please refer to [gaussian-splatting](https://github.com/graphdeco-inria/gaussian-splatting) and [torch-ngp](https://github.com/ashawkey/torch-ngp); a manual-build fallback is sketched below.
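+
+ If the recursive clone succeeded but the environment step fails on the CUDA extensions, a hedged fallback is to build the bundled submodules manually (paths follow this repo's `.gitmodules`):
+
+ ```bash
+ pip install ./submodules/diff-gaussian-rasterization
+ pip install ./submodules/simple-knn
+ ```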
+
+ ### Preparation
+
+ - Prepare the face-parsing model and the 3DMM model for head pose estimation.
+
+ ```bash
+ bash scripts/prepare.sh
+ ```
+
+ - Download the 3DMM model from [Basel Face Model 2009](https://faces.dmi.unibas.ch/bfm/main.php?nav=1-1-0&id=details):
+
+ ```bash
+ # 1. copy 01_MorphableModel.mat to data_utils/face_tracking/3DMM/
+ # 2. run the following
+ cd data_utils/face_tracking
+ python convert_BFM.py
+ ```
+
+ - Prepare the environment for [EasyPortrait](https://github.com/hukenovs/easyportrait):
+
+ ```bash
+ # prepare mmcv
+ conda activate talking_gaussian
+ pip install -U openmim
+ mim install mmcv-full==1.7.1
+
+ # download model weight
+ cd data_utils/easyportrait
+ wget "https://n-ws-620xz-pd11.s3pd11.sbercloud.ru/b-ws-620xz-pd11-jux/easyportrait/experiments/models/fpn-fp-512.pth"
+ ```
+
+ ## Usage
+
+ ### Important Notice
+
+ - This code is provided for research purposes only. The author makes no warranties, express or implied, as to the accuracy, completeness, or fitness for a particular purpose of the code. Use this code at your own risk.
+
+ - The author explicitly prohibits the use of this code for any malicious or illegal activities. By using this code, you agree to comply with all applicable laws and regulations, and you agree not to use it to harm others or to perform any actions that would be considered unethical or illegal.
+
+ - The author will not be responsible for any damages, losses, or issues that arise from the use of this code.
+
+ - Users are encouraged to use this code responsibly and ethically.
+
+ ### Video Dataset
+ [Here](https://drive.google.com/drive/folders/1E_8W805lioIznqbkvTQHWWi5IFXUG7Er?usp=drive_link) we provide two video clips used in our experiments, which are captured from YouTube. Please respect the original content creators' rights and comply with YouTube's copyright policies when using them.
+
+ Other videos we used can be found in [GeneFace](https://github.com/yerfor/GeneFace) and [AD-NeRF](https://github.com/YudongGuo/AD-NeRF).
+
+
+ ### Pre-processing Training Video
+
+ * Put the training video under `data/<ID>/<ID>.mp4`.
+
+ The video **must be 25 FPS, with all frames containing the talking person**.
+ The resolution should be about 512x512, and the duration about 1-5 minutes. If your source clip does not meet these constraints, it can be conformed first, as in the sketch below.
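+
+ A hedged example using ffmpeg (input path and crop expression are placeholders; adjust to your footage):
+
+ ```bash
+ # conform an arbitrary clip to 25 FPS and a square ~512x512 center crop
+ ffmpeg -i raw_clip.mp4 -r 25 -vf "crop=in_h:in_h,scale=512:512" data/<ID>/<ID>.mp4
+ ```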
+
+ * Run the script to process the video.
+
+ ```bash
+ python data_utils/process.py data/<ID>/<ID>.mp4
+ ```
+
+ * Obtain Action Units
+
+ Run `FeatureExtraction` in [OpenFace](https://github.com/TadasBaltrusaitis/OpenFace), then rename and move the output CSV file to `data/<ID>/au.csv`. An example invocation is sketched below.
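+
+ A hedged sketch, assuming a standard OpenFace build (the binary location and output naming may differ in your setup):
+
+ ```bash
+ ./build/bin/FeatureExtraction -f data/<ID>/<ID>.mp4 -out_dir data/<ID>
+ mv data/<ID>/<ID>.csv data/<ID>/au.csv
+ ```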
+
+ * Generate tooth masks
+
+ ```bash
+ export PYTHONPATH=./data_utils/easyportrait
+ python ./data_utils/easyportrait/create_teeth_mask.py ./data/<ID>
+ ```
+
+ ### Audio Pre-process
+
+ In our paper, we use DeepSpeech features for evaluation.
+
+ * DeepSpeech
+
+ ```bash
+ python data_utils/deepspeech_features/extract_ds_features.py --input data/<name>.wav # saved to data/<name>.npy
+ ```
+
+ - HuBERT
+
+ As in ER-NeRF, HuBERT features are also supported; they are recommended when the audio is not in English.
+
+ Specify `--audio_extractor hubert` when training and testing.
+
+ ```
+ python data_utils/hubert.py --wav data/<name>.wav # saved to data/<name>_hu.npy
+ ```
+
+ ### Train
+
+ ```bash
+ # If resources are sufficient, parts of the pipeline can run in parallel to speed up training. See the script.
+ bash scripts/train_xx.sh data/<ID> output/<project_name> <GPU_ID>
+ ```
+
+ ### Test
+
+ ```bash
+ # saved to output/<project_name>/test/ours_None/renders
+ python synthesize_fuse.py -S data/<ID> -M output/<project_name> --eval
+ ```
+
+ ### Inference with target audio
+
+ ```bash
+ python synthesize_fuse.py -S data/<ID> -M output/<project_name> --use_train --audio <preprocessed_audio_feature>.npy
+ ```
+
+ ## Citation
+
+ Consider citing as below if you find this repository helpful to your project:
+
+ ```
+ @article{li2024talkinggaussian,
+   title={TalkingGaussian: Structure-Persistent 3D Talking Head Synthesis via Gaussian Splatting},
+   author={Jiahe Li and Jiawei Zhang and Xiao Bai and Jin Zheng and Xin Ning and Jun Zhou and Lin Gu},
+   journal={arXiv preprint arXiv:2404.15264},
+   year={2024}
+ }
+ ```
+
+
+ ## Acknowledgement
+
+ This code is developed on [gaussian-splatting](https://github.com/graphdeco-inria/gaussian-splatting) with [simple-knn](https://gitlab.inria.fr/bkerbl/simple-knn) and a modified [diff-gaussian-rasterization](https://github.com/ashawkey/diff-gaussian-rasterization). Partial code is from [RAD-NeRF](https://github.com/ashawkey/RAD-NeRF), [DFRF](https://github.com/sstzal/DFRF), [GeneFace](https://github.com/yerfor/GeneFace), and [AD-NeRF](https://github.com/YudongGuo/AD-NeRF). The teeth mask is from [EasyPortrait](https://github.com/hukenovs/easyportrait). Thanks for these great projects!
arguments/__init__.py ADDED
@@ -0,0 +1,118 @@
+ #
+ # Copyright (C) 2023, Inria
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
+ # All rights reserved.
+ #
+ # This software is free for non-commercial, research and evaluation use
+ # under the terms of the LICENSE.md file.
+ #
+ # For inquiries contact [email protected]
+ #
+
+ from argparse import ArgumentParser, Namespace
+ import sys
+ import os
+
+ class GroupParams:
+     pass
+
+ class ParamGroup:
+     def __init__(self, parser: ArgumentParser, name: str, fill_none=False):
+         group = parser.add_argument_group(name)
+         for key, value in vars(self).items():
+             shorthand = False
+             if key.startswith("_"):
+                 shorthand = True
+                 key = key[1:]
+             t = type(value)
+             value = value if not fill_none else None
+             if shorthand:
+                 if t == bool:
+                     group.add_argument("--" + key, ("-" + key[0:1]), ("-" + key[0:1].upper()), default=value, action="store_true")
+                 else:
+                     group.add_argument("--" + key, ("-" + key[0:1]), ("-" + key[0:1].upper()), default=value, type=t)
+             else:
+                 if t == bool:
+                     group.add_argument("--" + key, default=value, action="store_true")
+                 else:
+                     group.add_argument("--" + key, default=value, type=t)
+
+     def extract(self, args):
+         group = GroupParams()
+         for arg in vars(args).items():
+             if arg[0] in vars(self) or ("_" + arg[0]) in vars(self):
+                 setattr(group, arg[0], arg[1])
+         return group
+
+ class ModelParams(ParamGroup):
+     def __init__(self, parser, sentinel=False):
+         self.sh_degree = 2
+         self._source_path = ""
+         self._model_path = ""
+         self._images = "images"
+         self._resolution = -1
+         self._white_background = False
+         self.data_device = "cpu"
+         self.eval = False
+         self.audio = ""
+         self.init_num = 10_000
+         self.audio_extractor = "deepspeech"
+         super().__init__(parser, "Loading Parameters", sentinel)
+
+     def extract(self, args):
+         g = super().extract(args)
+         g.source_path = os.path.abspath(g.source_path)
+
+         return g
+
+ class PipelineParams(ParamGroup):
+     def __init__(self, parser):
+         self.convert_SHs_python = False
+         self.compute_cov3D_python = False
+         self.debug = False
+         super().__init__(parser, "Pipeline Parameters")
+
+ class OptimizationParams(ParamGroup):
+     def __init__(self, parser):
+         self.iterations = 50_000
+         self.position_lr_init = 0.00016
+         self.position_lr_final = 0.0000016
+         self.position_lr_delay_mult = 0.01
+         self.position_lr_max_steps = 45_000
+         self.feature_lr = 0.0025
+         self.opacity_lr = 0.05
+         self.scaling_lr = 0.003
+         self.rotation_lr = 0.001
+         self.percent_dense = 0.005
+         self.lambda_dssim = 0.2
+         self.densification_interval = 100
+         self.opacity_reset_interval = 3000
+         self.densify_from_iter = 500
+
+         self.densify_until_iter = 45_000
+         self.densify_grad_threshold = 0.0002
+         self.random_background = False
+         super().__init__(parser, "Optimization Parameters")
+
+ def get_combined_args(parser: ArgumentParser):
+     cmdlne_string = sys.argv[1:]
+     cfgfile_string = "Namespace()"
+     args_cmdline = parser.parse_args(cmdlne_string)
+
+     try:
+         cfgfilepath = os.path.join(args_cmdline.model_path, "cfg_args")
+         print("Looking for config file in", cfgfilepath)
+         with open(cfgfilepath) as cfg_file:
+             print("Config file found: {}".format(cfgfilepath))
+             cfgfile_string = cfg_file.read()
+     except TypeError:
+         print("Config file not found at")
+         pass
+     args_cfgfile = eval(cfgfile_string)
+
+     merged_dict = vars(args_cfgfile).copy()
+     for k, v in vars(args_cmdline).items():
+         if v != None:
+             merged_dict[k] = v
+     return Namespace(**merged_dict)
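
For orientation, this module mirrors the upstream gaussian-splatting argument machinery: each `ParamGroup` subclass turns its attributes into CLI flags (a leading underscore also registers one-letter shorthands), and `extract` copies the parsed values back into a plain namespace. A minimal sketch of how a training-style entry point typically wires it together (the surrounding script is assumed, not part of this commit):

```python
from argparse import ArgumentParser
from arguments import ModelParams, OptimizationParams, PipelineParams

parser = ArgumentParser(description="Training script parameters")
lp = ModelParams(parser)         # registers --source_path/-s/-S, --model_path/-m/-M, ...
op = OptimizationParams(parser)  # registers --iterations, learning rates, ...
pp = PipelineParams(parser)      # registers --debug and the SH/covariance toggles
args = parser.parse_args()

dataset = lp.extract(args)  # GroupParams holding only the model-related fields
opt = op.extract(args)
pipe = pp.extract(args)
```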
assets/main.png ADDED
data/.gitkeep ADDED
File without changes
data_utils/deepspeech_features/README.md ADDED
@@ -0,0 +1,20 @@
+ # Routines for DeepSpeech features processing
+ Several routines for [DeepSpeech](https://github.com/mozilla/DeepSpeech) features processing, such as speech feature generation for the [VOCA](https://github.com/TimoBolkart/voca) model.
+
+ ## Installation
+
+ ```
+ pip3 install -r requirements.txt
+ ```
+
+ ## Usage
+
+ Generate wav files:
+ ```
+ python3 extract_wav.py --in-video=<your_data_dir>
+ ```
+
+ Generate files with DeepSpeech features:
+ ```
+ python3 extract_ds_features.py --input=<your_data_dir>
+ ```
data_utils/deepspeech_features/deepspeech_features.py ADDED
@@ -0,0 +1,274 @@
+ """
+ DeepSpeech features processing routines.
+ NB: Based on VOCA code. See the corresponding license restrictions.
+ """
+
+ __all__ = ['conv_audios_to_deepspeech']
+
+ import numpy as np
+ import warnings
+ import resampy
+ from scipy.io import wavfile
+ from python_speech_features import mfcc
+ import tensorflow.compat.v1 as tf
+ tf.disable_v2_behavior()
+
+ def conv_audios_to_deepspeech(audios,
+                               out_files,
+                               num_frames_info,
+                               deepspeech_pb_path,
+                               audio_window_size=1,
+                               audio_window_stride=1):
+     """
+     Convert a list of audio files into files with DeepSpeech features.
+
+     Parameters
+     ----------
+     audios : list of str or list of None
+         Paths to input audio files.
+     out_files : list of str
+         Paths to output files with DeepSpeech features.
+     num_frames_info : list of int
+         List of numbers of frames.
+     deepspeech_pb_path : str
+         Path to DeepSpeech 0.1.0 frozen model.
+     audio_window_size : int, default 1
+         Audio window size.
+     audio_window_stride : int, default 1
+         Audio window stride.
+     """
+     graph, logits_ph, input_node_ph, input_lengths_ph = prepare_deepspeech_net(
+         deepspeech_pb_path)
+
+     with tf.compat.v1.Session(graph=graph) as sess:
+         for audio_file_path, out_file_path, num_frames in zip(audios, out_files, num_frames_info):
+             print(audio_file_path)
+             print(out_file_path)
+             audio_sample_rate, audio = wavfile.read(audio_file_path)
+             if audio.ndim != 1:
+                 warnings.warn(
+                     "Audio has multiple channels, the first channel is used")
+                 audio = audio[:, 0]
+             ds_features = pure_conv_audio_to_deepspeech(
+                 audio=audio,
+                 audio_sample_rate=audio_sample_rate,
+                 audio_window_size=audio_window_size,
+                 audio_window_stride=audio_window_stride,
+                 num_frames=num_frames,
+                 net_fn=lambda x: sess.run(
+                     logits_ph,
+                     feed_dict={
+                         input_node_ph: x[np.newaxis, ...],
+                         input_lengths_ph: [x.shape[0]]}))
+
+             net_output = ds_features.reshape(-1, 29)
+             win_size = 16
+             zero_pad = np.zeros((int(win_size / 2), net_output.shape[1]))
+             net_output = np.concatenate(
+                 (zero_pad, net_output, zero_pad), axis=0)
+             windows = []
+             for window_index in range(0, net_output.shape[0] - win_size, 2):
+                 windows.append(
+                     net_output[window_index:window_index + win_size])
+             print(np.array(windows).shape)
+             np.save(out_file_path, np.array(windows))
+
+
+ def prepare_deepspeech_net(deepspeech_pb_path):
+     """
+     Load and prepare the DeepSpeech network.
+
+     Parameters
+     ----------
+     deepspeech_pb_path : str
+         Path to DeepSpeech 0.1.0 frozen model.
+
+     Returns
+     -------
+     graph : obj
+         TensorFlow graph.
+     logits_ph : obj
+         TensorFlow placeholder for `logits`.
+     input_node_ph : obj
+         TensorFlow placeholder for `input_node`.
+     input_lengths_ph : obj
+         TensorFlow placeholder for `input_lengths`.
+     """
+     # Load graph and placeholders:
+     with tf.io.gfile.GFile(deepspeech_pb_path, "rb") as f:
+         graph_def = tf.compat.v1.GraphDef()
+         graph_def.ParseFromString(f.read())
+
+     graph = tf.compat.v1.get_default_graph()
+     tf.import_graph_def(graph_def, name="deepspeech")
+     logits_ph = graph.get_tensor_by_name("deepspeech/logits:0")
+     input_node_ph = graph.get_tensor_by_name("deepspeech/input_node:0")
+     input_lengths_ph = graph.get_tensor_by_name("deepspeech/input_lengths:0")
+
+     return graph, logits_ph, input_node_ph, input_lengths_ph
+
+
+ def pure_conv_audio_to_deepspeech(audio,
+                                   audio_sample_rate,
+                                   audio_window_size,
+                                   audio_window_stride,
+                                   num_frames,
+                                   net_fn):
+     """
+     Core routine for converting audio into DeepSpeech features.
+
+     Parameters
+     ----------
+     audio : np.array
+         Audio data.
+     audio_sample_rate : int
+         Audio sample rate.
+     audio_window_size : int
+         Audio window size.
+     audio_window_stride : int
+         Audio window stride.
+     num_frames : int or None
+         Number of frames.
+     net_fn : func
+         Function for the DeepSpeech model call.
+
+     Returns
+     -------
+     np.array
+         DeepSpeech features.
+     """
+     target_sample_rate = 16000
+     if audio_sample_rate != target_sample_rate:
+         resampled_audio = resampy.resample(
+             x=audio.astype(np.float32),
+             sr_orig=audio_sample_rate,
+             sr_new=target_sample_rate)
+     else:
+         resampled_audio = audio.astype(np.float32)
+     input_vector = conv_audio_to_deepspeech_input_vector(
+         audio=resampled_audio.astype(np.int16),
+         sample_rate=target_sample_rate,
+         num_cepstrum=26,
+         num_context=9)
+
+     network_output = net_fn(input_vector)
+     # print(network_output.shape)
+
+     deepspeech_fps = 50
+     video_fps = 50  # Change this option if the video fps is different
+     audio_len_s = float(audio.shape[0]) / audio_sample_rate
+     if num_frames is None:
+         num_frames = int(round(audio_len_s * video_fps))
+     else:
+         video_fps = num_frames / audio_len_s
+     network_output = interpolate_features(
+         features=network_output[:, 0],
+         input_rate=deepspeech_fps,
+         output_rate=video_fps,
+         output_len=num_frames)
+
+     # Make windows:
+     zero_pad = np.zeros((int(audio_window_size / 2), network_output.shape[1]))
+     network_output = np.concatenate(
+         (zero_pad, network_output, zero_pad), axis=0)
+     windows = []
+     for window_index in range(0, network_output.shape[0] - audio_window_size, audio_window_stride):
+         windows.append(
+             network_output[window_index:window_index + audio_window_size])
+
+     return np.array(windows)
+
+
+ def conv_audio_to_deepspeech_input_vector(audio,
+                                           sample_rate,
+                                           num_cepstrum,
+                                           num_context):
+     """
+     Convert raw audio data into a DeepSpeech input vector.
+
+     Parameters
+     ----------
+     audio : np.array
+         Audio data.
+     sample_rate : int
+         Audio sample rate.
+     num_cepstrum : int
+         Number of cepstral coefficients.
+     num_context : int
+         Number of context frames.
+
+     Returns
+     -------
+     np.array
+         DeepSpeech input vector.
+     """
+     # Get mfcc coefficients:
+     features = mfcc(
+         signal=audio,
+         samplerate=sample_rate,
+         numcep=num_cepstrum)
+
+     # We only keep every second feature (BiRNN stride = 2):
+     features = features[::2]
+
+     # One stride per time step in the input:
+     num_strides = len(features)
+
+     # Add empty initial and final contexts:
+     empty_context = np.zeros((num_context, num_cepstrum), dtype=features.dtype)
+     features = np.concatenate((empty_context, features, empty_context))
+
+     # Create a view into the array with overlapping strides of size
+     # numcontext (past) + 1 (present) + numcontext (future):
+     window_size = 2 * num_context + 1
+     train_inputs = np.lib.stride_tricks.as_strided(
+         features,
+         shape=(num_strides, window_size, num_cepstrum),
+         strides=(features.strides[0],
+                  features.strides[0], features.strides[1]),
+         writeable=False)
+
+     # Flatten the second and third dimensions:
+     train_inputs = np.reshape(train_inputs, [num_strides, -1])
+
+     train_inputs = np.copy(train_inputs)
+     train_inputs = (train_inputs - np.mean(train_inputs)) / \
+         np.std(train_inputs)
+
+     return train_inputs
+
+
+ def interpolate_features(features,
+                          input_rate,
+                          output_rate,
+                          output_len):
+     """
+     Interpolate DeepSpeech features.
+
+     Parameters
+     ----------
+     features : np.array
+         DeepSpeech features.
+     input_rate : int
+         Input rate (FPS).
+     output_rate : int
+         Output rate (FPS).
+     output_len : int
+         Output data length.
+
+     Returns
+     -------
+     np.array
+         Interpolated data.
+     """
+     input_len = features.shape[0]
+     num_features = features.shape[1]
+     input_timestamps = np.arange(input_len) / float(input_rate)
+     output_timestamps = np.arange(output_len) / float(output_rate)
+     output_features = np.zeros((output_len, num_features))
+     for feature_idx in range(num_features):
+         output_features[:, feature_idx] = np.interp(
+             x=output_timestamps,
+             xp=input_timestamps,
+             fp=features[:, feature_idx])
+     return output_features
data_utils/deepspeech_features/deepspeech_store.py ADDED
@@ -0,0 +1,172 @@
+ """
+ Routines for loading the DeepSpeech model.
+ """
+
+ __all__ = ['get_deepspeech_model_file']
+
+ import os
+ import zipfile
+ import logging
+ import hashlib
+
+
+ deepspeech_features_repo_url = 'https://github.com/osmr/deepspeech_features'
+
+
+ def get_deepspeech_model_file(local_model_store_dir_path=os.path.join("~", ".tensorflow", "models")):
+     """
+     Return the location of the pretrained model on the local file system. This function downloads from the online
+     model zoo when the model cannot be found or its hash mismatches. The root directory is created if it doesn't exist.
+
+     Parameters
+     ----------
+     local_model_store_dir_path : str, default $TENSORFLOW_HOME/models
+         Location for keeping the model parameters.
+
+     Returns
+     -------
+     file_path
+         Path to the requested pretrained model file.
+     """
+     sha1_hash = "b90017e816572ddce84f5843f1fa21e6a377975e"
+     file_name = "deepspeech-0_1_0-b90017e8.pb"
+     local_model_store_dir_path = os.path.expanduser(local_model_store_dir_path)
+     file_path = os.path.join(local_model_store_dir_path, file_name)
+     if os.path.exists(file_path):
+         if _check_sha1(file_path, sha1_hash):
+             return file_path
+         else:
+             logging.warning("Mismatch in the content of model file detected. Downloading again.")
+     else:
+         logging.info("Model file not found. Downloading to {}.".format(file_path))
+
+     if not os.path.exists(local_model_store_dir_path):
+         os.makedirs(local_model_store_dir_path)
+
+     zip_file_path = file_path + ".zip"
+     _download(
+         url="{repo_url}/releases/download/{repo_release_tag}/{file_name}.zip".format(
+             repo_url=deepspeech_features_repo_url,
+             repo_release_tag="v0.0.1",
+             file_name=file_name),
+         path=zip_file_path,
+         overwrite=True)
+     with zipfile.ZipFile(zip_file_path) as zf:
+         zf.extractall(local_model_store_dir_path)
+     os.remove(zip_file_path)
+
+     if _check_sha1(file_path, sha1_hash):
+         return file_path
+     else:
+         raise ValueError("Downloaded file has different hash. Please try again.")
+
+
+ def _download(url, path=None, overwrite=False, sha1_hash=None, retries=5, verify_ssl=True):
+     """
+     Download a given URL.
+
+     Parameters
+     ----------
+     url : str
+         URL to download.
+     path : str, optional
+         Destination path to store the downloaded file. By default stores to the
+         current directory with the same name as in the URL.
+     overwrite : bool, optional
+         Whether to overwrite the destination file if it already exists.
+     sha1_hash : str, optional
+         Expected sha1 hash in hexadecimal digits. Will ignore an existing file when the hash is specified
+         but doesn't match.
+     retries : integer, default 5
+         The number of times to attempt the download in case of failure or non-200 return codes.
+     verify_ssl : bool, default True
+         Verify SSL certificates.
+
+     Returns
+     -------
+     str
+         The file path of the downloaded file.
+     """
+     import warnings
+     try:
+         import requests
+     except ImportError:
+         class requests_failed_to_import(object):
+             pass
+         requests = requests_failed_to_import
+
+     if path is None:
+         fname = url.split("/")[-1]
+         # Empty filenames are invalid
+         assert fname, "Can't construct file-name from this URL. Please set the `path` option manually."
+     else:
+         path = os.path.expanduser(path)
+         if os.path.isdir(path):
+             fname = os.path.join(path, url.split("/")[-1])
+         else:
+             fname = path
+     assert retries >= 0, "Number of retries should be at least 0"
+
+     if not verify_ssl:
+         warnings.warn(
+             "Unverified HTTPS request is being made (verify_ssl=False). "
+             "Adding certificate verification is strongly advised.")
+
+     if overwrite or not os.path.exists(fname) or (sha1_hash and not _check_sha1(fname, sha1_hash)):
+         dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
+         if not os.path.exists(dirname):
+             os.makedirs(dirname)
+         while retries + 1 > 0:
+             # Disable pylint too-broad-Exception warning
+             # pylint: disable=W0703
+             try:
+                 print("Downloading {} from {}...".format(fname, url))
+                 r = requests.get(url, stream=True, verify=verify_ssl)
+                 if r.status_code != 200:
+                     raise RuntimeError("Failed downloading url {}".format(url))
+                 with open(fname, "wb") as f:
+                     for chunk in r.iter_content(chunk_size=1024):
+                         if chunk:  # filter out keep-alive new chunks
+                             f.write(chunk)
+                 if sha1_hash and not _check_sha1(fname, sha1_hash):
+                     raise UserWarning("File {} is downloaded but the content hash does not match."
+                                       " The repo may be outdated or download may be incomplete. "
+                                       "If the `repo_url` is overridden, consider switching to "
+                                       "the default repo.".format(fname))
+                 break
+             except Exception as e:
+                 retries -= 1
+                 if retries <= 0:
+                     raise e
+                 else:
+                     print("download failed, retrying, {} attempt{} left"
+                           .format(retries, "s" if retries > 1 else ""))
+
+     return fname
+
+
+ def _check_sha1(filename, sha1_hash):
+     """
+     Check whether the sha1 hash of the file content matches the expected hash.
+
+     Parameters
+     ----------
+     filename : str
+         Path to the file.
+     sha1_hash : str
+         Expected sha1 hash in hexadecimal digits.
+
+     Returns
+     -------
+     bool
+         Whether the file content matches the expected hash.
+     """
+     sha1 = hashlib.sha1()
+     with open(filename, "rb") as f:
+         while True:
+             data = f.read(1048576)
+             if not data:
+                 break
+             sha1.update(data)
+
+     return sha1.hexdigest() == sha1_hash
data_utils/deepspeech_features/extract_ds_features.py ADDED
@@ -0,0 +1,130 @@
+ """
+ Script for extracting DeepSpeech features from an audio file.
+ """
+
+ import os
+ import argparse
+ import numpy as np
+ import pandas as pd
+ from deepspeech_store import get_deepspeech_model_file
+ from deepspeech_features import conv_audios_to_deepspeech
+
+
+ def parse_args():
+     """
+     Create python script parameters.
+     Returns
+     -------
+     ArgumentParser
+         Resulting args.
+     """
+     parser = argparse.ArgumentParser(
+         description="Extract DeepSpeech features from audio file",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+     parser.add_argument(
+         "--input",
+         type=str,
+         required=True,
+         help="path to input audio file or directory")
+     parser.add_argument(
+         "--output",
+         type=str,
+         help="path to output file with DeepSpeech features")
+     parser.add_argument(
+         "--deepspeech",
+         type=str,
+         help="path to DeepSpeech 0.1.0 frozen model")
+     parser.add_argument(
+         "--metainfo",
+         type=str,
+         help="path to file with meta-information")
+
+     args = parser.parse_args()
+     return args
+
+
+ def extract_features(in_audios,
+                      out_files,
+                      deepspeech_pb_path,
+                      metainfo_file_path=None):
+     """
+     Extract DeepSpeech features from audio files.
+     Parameters
+     ----------
+     in_audios : list of str
+         Paths to input audio files.
+     out_files : list of str
+         Paths to output files with DeepSpeech features.
+     deepspeech_pb_path : str
+         Path to DeepSpeech 0.1.0 frozen model.
+     metainfo_file_path : str, default None
+         Path to file with meta-information.
+     """
+     if metainfo_file_path is None:
+         num_frames_info = [None] * len(in_audios)
+     else:
+         train_df = pd.read_csv(
+             metainfo_file_path,
+             sep="\t",
+             index_col=False,
+             dtype={"Id": int, "File": str, "Count": int})
+         num_frames_info = train_df["Count"].values
+         assert (len(num_frames_info) == len(in_audios))
+
+     for i, in_audio in enumerate(in_audios):
+         if not out_files[i]:
+             file_stem, _ = os.path.splitext(in_audio)
+             out_files[i] = file_stem + ".npy"
+         # print(out_files[i])
+     conv_audios_to_deepspeech(
+         audios=in_audios,
+         out_files=out_files,
+         num_frames_info=num_frames_info,
+         deepspeech_pb_path=deepspeech_pb_path)
+
+
+ def main():
+     """
+     Main body of script.
+     """
+     args = parse_args()
+     in_audio = os.path.expanduser(args.input)
+     if not os.path.exists(in_audio):
+         raise Exception("Input file/directory doesn't exist: {}".format(in_audio))
+     deepspeech_pb_path = args.deepspeech
+     # added: always point at the default cached model path; it is fetched below if missing
+     deepspeech_pb_path = True
+     args.deepspeech = '~/.tensorflow/models/deepspeech-0_1_0-b90017e8.pb'
+     if deepspeech_pb_path is None:
+         deepspeech_pb_path = ""
+     if deepspeech_pb_path:
+         deepspeech_pb_path = os.path.expanduser(args.deepspeech)
+         if not os.path.exists(deepspeech_pb_path):
+             deepspeech_pb_path = get_deepspeech_model_file()
+     if os.path.isfile(in_audio):
+         extract_features(
+             in_audios=[in_audio],
+             out_files=[args.output],
+             deepspeech_pb_path=deepspeech_pb_path,
+             metainfo_file_path=args.metainfo)
+     else:
+         audio_file_paths = []
+         for file_name in os.listdir(in_audio):
+             if not os.path.isfile(os.path.join(in_audio, file_name)):
+                 continue
+             _, file_ext = os.path.splitext(file_name)
+             if file_ext.lower() == ".wav":
+                 audio_file_path = os.path.join(in_audio, file_name)
+                 audio_file_paths.append(audio_file_path)
+         audio_file_paths = sorted(audio_file_paths)
+         out_file_paths = [""] * len(audio_file_paths)
+         extract_features(
+             in_audios=audio_file_paths,
+             out_files=out_file_paths,
+             deepspeech_pb_path=deepspeech_pb_path,
+             metainfo_file_path=args.metainfo)
+
+
+ if __name__ == "__main__":
+     main()
data_utils/deepspeech_features/extract_wav.py ADDED
@@ -0,0 +1,87 @@
+ """
+ Script for extracting audio (16-bit, mono, 16000 Hz) from a video file.
+ """
+
+ import os
+ import argparse
+ import subprocess
+
+
+ def parse_args():
+     """
+     Create python script parameters.
+
+     Returns
+     -------
+     ArgumentParser
+         Resulting args.
+     """
+     parser = argparse.ArgumentParser(
+         description="Extract audio from video file",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+     parser.add_argument(
+         "--in-video",
+         type=str,
+         required=True,
+         help="path to input video file or directory")
+     parser.add_argument(
+         "--out-audio",
+         type=str,
+         help="path to output audio file")
+
+     args = parser.parse_args()
+     return args
+
+
+ def extract_audio(in_video,
+                   out_audio):
+     """
+     Extract audio from a video file.
+
+     Parameters
+     ----------
+     in_video : str
+         Path to input video file.
+     out_audio : str
+         Path to output audio file.
+     """
+     if not out_audio:
+         file_stem, _ = os.path.splitext(in_video)
+         out_audio = file_stem + ".wav"
+     # command1 = "ffmpeg -i {in_video} -vn -acodec copy {aac_audio}"
+     # command2 = "ffmpeg -i {aac_audio} -vn -acodec pcm_s16le -ac 1 -ar 22000 {out_audio}"
+     # command = "ffmpeg -i {in_video} -vn -acodec pcm_s16le -ac 1 -ar 22000 {out_audio}"
+     command = "ffmpeg -i {in_video} -vn -acodec pcm_s16le -ac 1 -ar 16000 {out_audio}"
+     subprocess.call([command.format(in_video=in_video, out_audio=out_audio)], shell=True)
+
+
+ def main():
+     """
+     Main body of script.
+     """
+     args = parse_args()
+     in_video = os.path.expanduser(args.in_video)
+     if not os.path.exists(in_video):
+         raise Exception("Input file/directory doesn't exist: {}".format(in_video))
+     if os.path.isfile(in_video):
+         extract_audio(
+             in_video=in_video,
+             out_audio=args.out_audio)
+     else:
+         video_file_paths = []
+         for file_name in os.listdir(in_video):
+             if not os.path.isfile(os.path.join(in_video, file_name)):
+                 continue
+             _, file_ext = os.path.splitext(file_name)
+             if file_ext.lower() in (".mp4", ".mkv", ".avi"):
+                 video_file_path = os.path.join(in_video, file_name)
+                 video_file_paths.append(video_file_path)
+         video_file_paths = sorted(video_file_paths)
+         for video_file_path in video_file_paths:
+             extract_audio(
+                 in_video=video_file_path,
+                 out_audio="")
+
+
+ if __name__ == "__main__":
+     main()
data_utils/deepspeech_features/fea_win.py ADDED
@@ -0,0 +1,11 @@
+ # Standalone helper: window precomputed DeepSpeech logits ('french.ds.npy')
+ # into overlapping 16-frame chunks with stride 2 and save them.
+ import numpy as np
+
+ net_output = np.load('french.ds.npy').reshape(-1, 29)
+ win_size = 16
+ zero_pad = np.zeros((int(win_size / 2), net_output.shape[1]))
+ net_output = np.concatenate((zero_pad, net_output, zero_pad), axis=0)
+ windows = []
+ for window_index in range(0, net_output.shape[0] - win_size, 2):
+     windows.append(net_output[window_index:window_index + win_size])
+ print(np.array(windows).shape)
+ np.save('aud_french.npy', np.array(windows))
data_utils/easyportrait/create_teeth_mask.py ADDED
@@ -0,0 +1,34 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ from argparse import ArgumentParser
+
+ from mmseg.apis import inference_segmentor, init_segmentor, show_result_pyplot
+
+ import os
+ import glob
+ from tqdm import tqdm
+ import numpy as np
+
+ def main():
+     parser = ArgumentParser()
+     parser.add_argument('dataset', help='Dataset directory (e.g. ./data/<ID>)')
+     parser.add_argument('--config', default="./data_utils/easyportrait/local_configs/easyportrait_experiments_v2/fpn-fp/fpn-fp.py", help='Config file')
+     parser.add_argument('--checkpoint', default="./data_utils/easyportrait/fpn-fp-512.pth", help='Checkpoint file')
+
+     args = parser.parse_args()
+
+     # build the model from a config file and a checkpoint file
+     model = init_segmentor(args.config, args.checkpoint, device='cuda:0')
+
+     # run inference on every extracted frame
+     dataset_path = os.path.join(args.dataset, 'ori_imgs')
+     out_path = os.path.join(args.dataset, 'teeth_mask')
+     os.makedirs(out_path, exist_ok=True)
+
+     for file in tqdm(glob.glob(os.path.join(dataset_path, '*.jpg'))):
+         result = inference_segmentor(model, file)
+         # keep only the teeth class (label 7) and save it as a boolean mask
+         result[0][result[0] != 7] = 0
+         np.save(file.replace('jpg', 'npy').replace('ori_imgs', 'teeth_mask'), result[0].astype(np.bool_))
+
+
+ if __name__ == '__main__':
+     main()
data_utils/easyportrait/local_configs/__base__/datasets/easyportrait_1024x1024.py ADDED
@@ -0,0 +1,59 @@
+ # dataset settings
+ dataset_type = 'EasyPortraitDataset'
+ data_root = 'path/to/data/EasyPortrait'
+ img_norm_cfg = dict(
+     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+ train_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(type='LoadAnnotations'),
+     dict(type='Pad', size=(1920, 1920), pad_val=0, seg_pad_val=255),
+     dict(type='Resize', img_scale=(1024, 1024)),
+
+     # We don't use RandomFlip, but need it in the code to fix error: https://github.com/open-mmlab/mmsegmentation/issues/231
+     dict(type='RandomFlip', prob=0.0),
+     dict(type='PhotoMetricDistortion',
+          brightness_delta=16,
+          contrast_range=(0.5, 1.0),
+          saturation_range=(0.5, 1.0),
+          hue_delta=9),
+     dict(type='Normalize', **img_norm_cfg),
+     dict(type='DefaultFormatBundle'),
+     dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+ ]
+
+ test_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=(1024, 1024),
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
+
+ data = dict(
+     samples_per_gpu=4,
+     workers_per_gpu=4,
+     train=dict(
+         type=dataset_type,
+         data_root=data_root,
+         img_dir='images/train',
+         ann_dir='annotations/train',
+         pipeline=train_pipeline),
+     val=dict(
+         type=dataset_type,
+         data_root=data_root,
+         img_dir='images/val',
+         ann_dir='annotations/val',
+         pipeline=test_pipeline),
+     test=dict(
+         type=dataset_type,
+         data_root=data_root,
+         img_dir='images/test',
+         ann_dir='annotations/test',
+         pipeline=test_pipeline))
data_utils/easyportrait/local_configs/__base__/datasets/easyportrait_384x384.py ADDED
@@ -0,0 +1,59 @@
+ # dataset settings
+ dataset_type = 'EasyPortraitDataset'
+ data_root = 'path/to/data/EasyPortrait'
+ img_norm_cfg = dict(
+     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+ train_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(type='LoadAnnotations'),
+     dict(type='Pad', size=(1920, 1920), pad_val=0, seg_pad_val=255),
+     dict(type='Resize', img_scale=(384, 384)),
+
+     # We don't use RandomFlip, but need it in the code to fix error: https://github.com/open-mmlab/mmsegmentation/issues/231
+     dict(type='RandomFlip', prob=0.0),
+     dict(type='PhotoMetricDistortion',
+          brightness_delta=16,
+          contrast_range=(0.5, 1.0),
+          saturation_range=(0.5, 1.0),
+          hue_delta=9),
+     dict(type='Normalize', **img_norm_cfg),
+     dict(type='DefaultFormatBundle'),
+     dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+ ]
+
+ test_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=(384, 384),
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
+
+ data = dict(
+     samples_per_gpu=4,
+     workers_per_gpu=4,
+     train=dict(
+         type=dataset_type,
+         data_root=data_root,
+         img_dir='images/train',
+         ann_dir='annotations/train',
+         pipeline=train_pipeline),
+     val=dict(
+         type=dataset_type,
+         data_root=data_root,
+         img_dir='images/val',
+         ann_dir='annotations/val',
+         pipeline=test_pipeline),
+     test=dict(
+         type=dataset_type,
+         data_root=data_root,
+         img_dir='images/test',
+         ann_dir='annotations/test',
+         pipeline=test_pipeline))
data_utils/easyportrait/local_configs/__base__/datasets/easyportrait_512x512.py ADDED
@@ -0,0 +1,59 @@
+ # dataset settings
+ dataset_type = 'EasyPortraitDataset'
+ data_root = 'path/to/data/EasyPortrait'
+ img_norm_cfg = dict(
+     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+ train_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(type='LoadAnnotations'),
+     dict(type='Pad', size=(1920, 1920), pad_val=0, seg_pad_val=255),
+     dict(type='Resize', img_scale=(512, 512)),
+
+     # We don't use RandomFlip, but need it in the code to fix error: https://github.com/open-mmlab/mmsegmentation/issues/231
+     dict(type='RandomFlip', prob=0.0),
+     dict(type='PhotoMetricDistortion',
+          brightness_delta=16,
+          contrast_range=(0.5, 1.0),
+          saturation_range=(0.5, 1.0),
+          hue_delta=9),
+     dict(type='Normalize', **img_norm_cfg),
+     dict(type='DefaultFormatBundle'),
+     dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+ ]
+
+ test_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=(512, 512),
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+ ]
+
+ data = dict(
+     samples_per_gpu=4,
+     workers_per_gpu=4,
+     train=dict(
+         type=dataset_type,
+         data_root=data_root,
+         img_dir='images/train',
+         ann_dir='annotations/train',
+         pipeline=train_pipeline),
+     val=dict(
+         type=dataset_type,
+         data_root=data_root,
+         img_dir='images/val',
+         ann_dir='annotations/val',
+         pipeline=test_pipeline),
+     test=dict(
+         type=dataset_type,
+         data_root=data_root,
+         img_dir='images/test',
+         ann_dir='annotations/test',
+         pipeline=test_pipeline))
data_utils/easyportrait/local_configs/__base__/default_runtime.py ADDED
@@ -0,0 +1,14 @@
+ # yapf:disable
+ log_config = dict(
+     interval=50,
+     hooks=[
+         dict(type='TextLoggerHook', by_epoch=False),
+         # dict(type='TensorboardLoggerHook')
+     ])
+ # yapf:enable
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
data_utils/easyportrait/local_configs/__base__/models/bisenetv2.py ADDED
@@ -0,0 +1,80 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained=None,
+     backbone=dict(
+         type='BiSeNetV2',
+         detail_channels=(64, 64, 128),
+         semantic_channels=(16, 32, 64, 128),
+         semantic_expansion_ratio=6,
+         bga_channels=128,
+         out_indices=(0, 1, 2, 3, 4),
+         init_cfg=None,
+         align_corners=False),
+     decode_head=dict(
+         type='FCNHead',
+         in_channels=128,
+         in_index=0,
+         channels=1024,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=[
+         dict(
+             type='FCNHead',
+             in_channels=16,
+             channels=16,
+             num_convs=2,
+             num_classes=19,
+             in_index=1,
+             norm_cfg=norm_cfg,
+             concat_input=False,
+             align_corners=False,
+             loss_decode=dict(
+                 type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+         dict(
+             type='FCNHead',
+             in_channels=32,
+             channels=64,
+             num_convs=2,
+             num_classes=19,
+             in_index=2,
+             norm_cfg=norm_cfg,
+             concat_input=False,
+             align_corners=False,
+             loss_decode=dict(
+                 type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+         dict(
+             type='FCNHead',
+             in_channels=64,
+             channels=256,
+             num_convs=2,
+             num_classes=19,
+             in_index=3,
+             norm_cfg=norm_cfg,
+             concat_input=False,
+             align_corners=False,
+             loss_decode=dict(
+                 type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+         dict(
+             type='FCNHead',
+             in_channels=128,
+             channels=1024,
+             num_convs=2,
+             num_classes=19,
+             in_index=4,
+             norm_cfg=norm_cfg,
+             concat_input=False,
+             align_corners=False,
+             loss_decode=dict(
+                 type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     ],
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
data_utils/easyportrait/local_configs/__base__/models/fcn_resnet50.py ADDED
@@ -0,0 +1,45 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 2, 4),
+         strides=(1, 2, 1, 1),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     decode_head=dict(
+         type='FCNHead',
+         in_channels=2048,
+         in_index=3,
+         channels=512,
+         num_convs=2,
+         concat_input=True,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=1024,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
data_utils/easyportrait/local_configs/__base__/models/fpn_resnet50.py ADDED
@@ -0,0 +1,36 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 1, 1),
+         strides=(1, 2, 2, 2),
+         norm_cfg=norm_cfg,
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     neck=dict(
+         type='FPN',
+         in_channels=[256, 512, 1024, 2048],
+         out_channels=256,
+         num_outs=4),
+     decode_head=dict(
+         type='FPNHead',
+         in_channels=[256, 256, 256, 256],
+         in_index=[0, 1, 2, 3],
+         feature_strides=[4, 8, 16, 32],
+         channels=128,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
data_utils/easyportrait/local_configs/__base__/models/lraspp.py ADDED
@@ -0,0 +1,25 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     backbone=dict(
+         type='MobileNetV3',
+         arch='large',
+         out_indices=(1, 3, 16),
+         norm_cfg=norm_cfg),
+     decode_head=dict(
+         type='LRASPPHead',
+         in_channels=(16, 24, 960),
+         in_index=(0, 1, 2),
+         channels=128,
+         input_transform='multiple_select',
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         act_cfg=dict(type='ReLU'),
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
data_utils/easyportrait/local_configs/__base__/models/segformer.py ADDED
@@ -0,0 +1,34 @@
+ # model settings
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained=None,
+     backbone=dict(
+         type='MixVisionTransformer',
+         in_channels=3,
+         embed_dims=32,
+         num_stages=4,
+         num_layers=[2, 2, 2, 2],
+         num_heads=[1, 2, 5, 8],
+         patch_sizes=[7, 3, 3, 3],
+         sr_ratios=[8, 4, 2, 1],
+         out_indices=(0, 1, 2, 3),
+         mlp_ratio=4,
+         qkv_bias=True,
+         drop_rate=0.0,
+         attn_drop_rate=0.0,
+         drop_path_rate=0.1),
+     decode_head=dict(
+         type='SegformerHead',
+         in_channels=[32, 64, 160, 256],
+         in_index=[0, 1, 2, 3],
+         channels=256,
+         dropout_ratio=0.1,
+         num_classes=19,
+         norm_cfg=norm_cfg,
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     # model training and testing settings
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
data_utils/easyportrait/local_configs/__base__/schedules/schedule_10k_adamw.py ADDED
@@ -0,0 +1,11 @@
+ # optimizer
+ optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001)
+ optimizer_config = dict()
+
+ # learning policy
+ lr_config = dict(policy='poly', power=0.9, min_lr=0.0, by_epoch=False)
+
+ # runtime settings
+ runner = dict(type='IterBasedRunner', max_iters=10000)
+ checkpoint_config = dict(by_epoch=False, interval=2000)
+ evaluation = dict(interval=2000, metric='mIoU')
data_utils/easyportrait/local_configs/__base__/schedules/schedule_160k_adamw.py ADDED
@@ -0,0 +1,9 @@
+ # optimizer
+ optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001)
+ optimizer_config = dict()
+ # learning policy
+ lr_config = dict(policy='poly', power=0.9, min_lr=0.0, by_epoch=False)
+ # runtime settings
+ runner = dict(type='IterBasedRunner', max_iters=160000)
+ checkpoint_config = dict(by_epoch=False, interval=4000)
+ evaluation = dict(interval=4000, metric='mIoU')
data_utils/easyportrait/local_configs/__base__/schedules/schedule_20k_adamw.py ADDED
@@ -0,0 +1,11 @@
+ # optimizer
+ optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001)
+ optimizer_config = dict()
+
+ # learning policy
+ lr_config = dict(policy='poly', power=0.9, min_lr=0.0, by_epoch=False)
+
+ # runtime settings
+ runner = dict(type='IterBasedRunner', max_iters=20000)
+ checkpoint_config = dict(by_epoch=False, interval=2000)
+ evaluation = dict(interval=2000, metric='mIoU')
data_utils/easyportrait/local_configs/__base__/schedules/schedule_40k_adamw.py ADDED
@@ -0,0 +1,9 @@
+ # optimizer
+ optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001)
+ optimizer_config = dict()
+ # learning policy
+ lr_config = dict(policy='poly', power=0.9, min_lr=0.0, by_epoch=False)
+ # runtime settings
+ runner = dict(type='IterBasedRunner', max_iters=40000)
+ checkpoint_config = dict(by_epoch=False, interval=4000)
+ evaluation = dict(interval=4000, metric='mIoU')
data_utils/easyportrait/local_configs/__base__/schedules/schedule_80k_adamw.py ADDED
@@ -0,0 +1,9 @@
+ # optimizer
+ optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001)
+ optimizer_config = dict()
+ # learning policy
+ lr_config = dict(policy='poly', power=0.9, min_lr=0.0, by_epoch=False)
+ # runtime settings
+ runner = dict(type='IterBasedRunner', max_iters=80000)
+ checkpoint_config = dict(by_epoch=False, interval=4000)
+ evaluation = dict(interval=4000, metric='mIoU')
data_utils/easyportrait/local_configs/easyportrait_experiments_v2/bisenet-fp/bisenetv2-fp.py ADDED
@@ -0,0 +1,221 @@
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+ type='EncoderDecoder',
+ pretrained=None,
+ backbone=dict(
+ type='BiSeNetV2',
+ detail_channels=(64, 64, 128),
+ semantic_channels=(16, 32, 64, 128),
+ semantic_expansion_ratio=6,
+ bga_channels=128,
+ out_indices=(0, 1, 2, 3, 4),
+ init_cfg=None,
+ align_corners=False),
+ decode_head=dict(
+ type='FCNHead',
+ in_channels=128,
+ in_index=0,
+ channels=1024,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10000)),
+ auxiliary_head=[
+ dict(
+ type='FCNHead',
+ in_channels=16,
+ channels=16,
+ num_convs=2,
+ num_classes=8,
+ in_index=1,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ concat_input=False,
+ align_corners=False,
+ sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ dict(
+ type='FCNHead',
+ in_channels=32,
+ channels=64,
+ num_convs=2,
+ num_classes=8,
+ in_index=2,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ concat_input=False,
+ align_corners=False,
+ sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ dict(
+ type='FCNHead',
+ in_channels=64,
+ channels=256,
+ num_convs=2,
+ num_classes=8,
+ in_index=3,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ concat_input=False,
+ align_corners=False,
+ sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ dict(
+ type='FCNHead',
+ in_channels=128,
+ channels=1024,
+ num_convs=2,
+ num_classes=8,
+ in_index=4,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ concat_input=False,
+ align_corners=False,
+ sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
+ ],
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
+ dataset_type = 'EasyPortraitFPDataset'
+ data_root = '/home/jovyan/datasets/wacv_24/'
+ img_norm_cfg = dict(
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True)
+ train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]
+ test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]
+ data = dict(
+ train=dict(
+ type='EasyPortraitFPDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+ 'right eye', 'lips', 'teeth'),
+ img_dir='easyportrait_384/images/train',
+ ann_dir='easyportrait_384/annotations_fp/train',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]),
+ val=dict(
+ type='EasyPortraitFPDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+ 'right eye', 'lips', 'teeth'),
+ img_dir='easyportrait_384/images/val',
+ ann_dir='easyportrait_384/annotations_fp/val',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ test=dict(
+ type='EasyPortraitFPDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+ 'right eye', 'lips', 'teeth'),
+ img_dir='easyportrait_384/images/test',
+ ann_dir='easyportrait_384/annotations_fp/test',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ samples_per_gpu=32,
+ workers_per_gpu=8)
+ log_config = dict(
+ interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
+ optimizer = dict(type='AdamW', lr=0.05, weight_decay=0.0001)
+ optimizer_config = dict()
+ lr_config = dict(
+ policy='poly',
+ power=0.9,
+ min_lr=0.0,
+ by_epoch=True,
+ warmup='linear',
+ warmup_iters=1000)
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
+ checkpoint_config = dict(by_epoch=True, interval=100)
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
+ work_dir = 'work_dirs/petrova/bisenet-fp'
+ gpu_ids = [0]
+ auto_resume = False
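For reference, a fully expanded config like the one above can be consumed directly by the mmsegmentation 0.x Python API, without the `_base_` mechanism. A minimal training sketch, assuming mmseg 0.x (`build_segmentor`, `build_dataset`, `train_segmentor`) and a local copy of the config file; note that the SyncBN layers in these configs expect a distributed launch (see the single-GPU note further below):

from mmcv import Config
from mmseg.models import build_segmentor
from mmseg.datasets import build_dataset
from mmseg.apis import train_segmentor

# Path is illustrative; point it at the file added above.
cfg = Config.fromfile(
    'local_configs/easyportrait_experiments_v2/bisenet-fp/bisenetv2-fp.py')
model = build_segmentor(cfg.model)          # EncoderDecoder + BiSeNetV2
datasets = [build_dataset(cfg.data.train)]  # EasyPortraitFPDataset, train split
train_segmentor(model, datasets, cfg, distributed=False, validate=True)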
data_utils/easyportrait/local_configs/easyportrait_experiments_v2/bisenet-ps/bisenetv2-ps.py ADDED
@@ -0,0 +1,218 @@
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+ type='EncoderDecoder',
+ pretrained=None,
+ backbone=dict(
+ type='BiSeNetV2',
+ detail_channels=(64, 64, 128),
+ semantic_channels=(16, 32, 64, 128),
+ semantic_expansion_ratio=6,
+ bga_channels=128,
+ out_indices=(0, 1, 2, 3, 4),
+ init_cfg=None,
+ align_corners=False),
+ decode_head=dict(
+ type='FCNHead',
+ in_channels=128,
+ in_index=0,
+ channels=1024,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10000)),
+ auxiliary_head=[
+ dict(
+ type='FCNHead',
+ in_channels=16,
+ channels=16,
+ num_convs=2,
+ num_classes=2,
+ in_index=1,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ concat_input=False,
+ align_corners=False,
+ sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ dict(
+ type='FCNHead',
+ in_channels=32,
+ channels=64,
+ num_convs=2,
+ num_classes=2,
+ in_index=2,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ concat_input=False,
+ align_corners=False,
+ sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ dict(
+ type='FCNHead',
+ in_channels=64,
+ channels=256,
+ num_convs=2,
+ num_classes=2,
+ in_index=3,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ concat_input=False,
+ align_corners=False,
+ sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ dict(
+ type='FCNHead',
+ in_channels=128,
+ channels=1024,
+ num_convs=2,
+ num_classes=2,
+ in_index=4,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ concat_input=False,
+ align_corners=False,
+ sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
+ ],
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
+ dataset_type = 'EasyPortraitPSDataset'
+ data_root = '/home/jovyan/datasets/wacv_24/'
+ img_norm_cfg = dict(
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True)
+ train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]
+ test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]
+ data = dict(
+ train=dict(
+ type='EasyPortraitPSDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'person'),
+ img_dir='easyportrait_384/images/train',
+ ann_dir='easyportrait_384/annotations_ps/train',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]),
+ val=dict(
+ type='EasyPortraitPSDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'person'),
+ img_dir='easyportrait_384/images/val',
+ ann_dir='easyportrait_384/annotations_ps/val',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ test=dict(
+ type='EasyPortraitPSDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'person'),
+ img_dir='easyportrait_384/images/test',
+ ann_dir='easyportrait_384/annotations_ps/test',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ samples_per_gpu=32,
+ workers_per_gpu=8)
+ log_config = dict(
+ interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
+ optimizer = dict(type='AdamW', lr=0.05, weight_decay=0.0001)
+ optimizer_config = dict()
+ lr_config = dict(
+ policy='poly',
+ power=0.9,
+ min_lr=0.0,
+ by_epoch=True,
+ warmup='linear',
+ warmup_iters=1000)
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
+ checkpoint_config = dict(by_epoch=True, interval=100)
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
+ work_dir = 'work_dirs/petrova/bisenet-ps/'
+ gpu_ids = [0]
+ auto_resume = False
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
2
+ model = dict(
3
+ type='EncoderDecoder',
4
+ pretrained='open-mmlab://resnet50_v1c',
5
+ backbone=dict(
6
+ type='ResNetV1c',
7
+ depth=50,
8
+ num_stages=4,
9
+ out_indices=(0, 1, 2, 3),
10
+ dilations=(1, 1, 2, 4),
11
+ strides=(1, 2, 1, 1),
12
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
13
+ norm_eval=False,
14
+ style='pytorch',
15
+ contract_dilation=True),
16
+ decode_head=dict(
17
+ type='DAHead',
18
+ in_channels=2048,
19
+ in_index=3,
20
+ channels=512,
21
+ pam_channels=64,
22
+ dropout_ratio=0.1,
23
+ num_classes=8,
24
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
25
+ align_corners=False,
26
+ loss_decode=dict(
27
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
28
+ auxiliary_head=dict(
29
+ type='FCNHead',
30
+ in_channels=1024,
31
+ in_index=2,
32
+ channels=256,
33
+ num_convs=1,
34
+ concat_input=False,
35
+ dropout_ratio=0.1,
36
+ num_classes=8,
37
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
38
+ align_corners=False,
39
+ loss_decode=dict(
40
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
41
+ train_cfg=dict(),
42
+ test_cfg=dict(mode='whole'))
43
+ dataset_type = 'EasyPortraitFPDataset'
44
+ data_root = '/home/jovyan/datasets/wacv_24/'
45
+ img_norm_cfg = dict(
46
+ mean=[143.55267075, 132.96705975, 126.94924335],
47
+ std=[60.2625333, 60.32740275, 59.30988645],
48
+ to_rgb=True)
49
+ train_pipeline = [
50
+ dict(type='LoadImageFromFile'),
51
+ dict(type='LoadAnnotations'),
52
+ dict(type='RandomFlip', prob=0.0),
53
+ dict(
54
+ type='PhotoMetricDistortion',
55
+ brightness_delta=16,
56
+ contrast_range=(0.5, 1.0),
57
+ saturation_range=(0.5, 1.0),
58
+ hue_delta=5),
59
+ dict(
60
+ type='Normalize',
61
+ mean=[143.55267075, 132.96705975, 126.94924335],
62
+ std=[60.2625333, 60.32740275, 59.30988645],
63
+ to_rgb=True),
64
+ dict(type='DefaultFormatBundle'),
65
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
66
+ ]
67
+ test_pipeline = [
68
+ dict(type='LoadImageFromFile'),
69
+ dict(
70
+ type='MultiScaleFlipAug',
71
+ img_scale=(384, 384),
72
+ flip=False,
73
+ transforms=[
74
+ dict(
75
+ type='Normalize',
76
+ mean=[143.55267075, 132.96705975, 126.94924335],
77
+ std=[60.2625333, 60.32740275, 59.30988645],
78
+ to_rgb=True),
79
+ dict(type='ImageToTensor', keys=['img']),
80
+ dict(type='Collect', keys=['img'])
81
+ ])
82
+ ]
83
+ data = dict(
84
+ train=dict(
85
+ type='EasyPortraitFPDataset',
86
+ data_root='/home/jovyan/datasets/wacv_24/',
87
+ classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
88
+ 'right eye', 'lips', 'teeth'),
89
+ img_dir='easyportrait_384/images/train',
90
+ ann_dir='easyportrait_384/annotations_fp/train',
91
+ pipeline=[
92
+ dict(type='LoadImageFromFile'),
93
+ dict(type='LoadAnnotations'),
94
+ dict(type='RandomFlip', prob=0.0),
95
+ dict(
96
+ type='PhotoMetricDistortion',
97
+ brightness_delta=16,
98
+ contrast_range=(0.5, 1.0),
99
+ saturation_range=(0.5, 1.0),
100
+ hue_delta=5),
101
+ dict(
102
+ type='Normalize',
103
+ mean=[143.55267075, 132.96705975, 126.94924335],
104
+ std=[60.2625333, 60.32740275, 59.30988645],
105
+ to_rgb=True),
106
+ dict(type='DefaultFormatBundle'),
107
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
108
+ ]),
109
+ val=dict(
110
+ type='EasyPortraitFPDataset',
111
+ data_root='/home/jovyan/datasets/wacv_24/',
112
+ classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
113
+ 'right eye', 'lips', 'teeth'),
114
+ img_dir='easyportrait_384/images/val',
115
+ ann_dir='easyportrait_384/annotations_fp/val',
116
+ pipeline=[
117
+ dict(type='LoadImageFromFile'),
118
+ dict(
119
+ type='MultiScaleFlipAug',
120
+ img_scale=(384, 384),
121
+ flip=False,
122
+ transforms=[
123
+ dict(
124
+ type='Normalize',
125
+ mean=[143.55267075, 132.96705975, 126.94924335],
126
+ std=[60.2625333, 60.32740275, 59.30988645],
127
+ to_rgb=True),
128
+ dict(type='ImageToTensor', keys=['img']),
129
+ dict(type='Collect', keys=['img'])
130
+ ])
131
+ ]),
132
+ test=dict(
133
+ type='EasyPortraitFPDataset',
134
+ data_root='/home/jovyan/datasets/wacv_24/',
135
+ classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
136
+ 'right eye', 'lips', 'teeth'),
137
+ img_dir='easyportrait_384/images/test',
138
+ ann_dir='easyportrait_384/annotations_fp/test',
139
+ pipeline=[
140
+ dict(type='LoadImageFromFile'),
141
+ dict(
142
+ type='MultiScaleFlipAug',
143
+ img_scale=(384, 384),
144
+ flip=False,
145
+ transforms=[
146
+ dict(
147
+ type='Normalize',
148
+ mean=[143.55267075, 132.96705975, 126.94924335],
149
+ std=[60.2625333, 60.32740275, 59.30988645],
150
+ to_rgb=True),
151
+ dict(type='ImageToTensor', keys=['img']),
152
+ dict(type='Collect', keys=['img'])
153
+ ])
154
+ ]),
155
+ samples_per_gpu=32,
156
+ workers_per_gpu=8)
157
+ log_config = dict(
158
+ interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
159
+ dist_params = dict(backend='nccl')
160
+ log_level = 'INFO'
161
+ load_from = None
162
+ resume_from = None
163
+ workflow = [('train', 1)]
164
+ cudnn_benchmark = True
165
+ optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001)
166
+ optimizer_config = dict()
167
+ lr_config = dict(policy='poly', power=0.9, min_lr=0.0, by_epoch=True)
168
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
169
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
170
+ checkpoint_config = dict(by_epoch=True, interval=100)
171
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
172
+ work_dir = 'work_dirs/petrova/danet-fp'
173
+ gpu_ids = [0]
174
+ auto_resume = False
data_utils/easyportrait/local_configs/easyportrait_experiments_v2/danet-ps/danet-ps.py ADDED
@@ -0,0 +1,171 @@
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='DAHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ pam_channels=64,
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
+ dataset_type = 'EasyPortraitPSDataset'
+ data_root = '/home/jovyan/datasets/wacv_24/'
+ img_norm_cfg = dict(
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True)
+ train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]
+ test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]
+ data = dict(
+ train=dict(
+ type='EasyPortraitPSDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'person'),
+ img_dir='easyportrait_384/images/train',
+ ann_dir='easyportrait_384/annotations_ps/train',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]),
+ val=dict(
+ type='EasyPortraitPSDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'person'),
+ img_dir='easyportrait_384/images/val',
+ ann_dir='easyportrait_384/annotations_ps/val',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ test=dict(
+ type='EasyPortraitPSDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'person'),
+ img_dir='easyportrait_384/images/test',
+ ann_dir='easyportrait_384/annotations_ps/test',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ samples_per_gpu=32,
+ workers_per_gpu=8)
+ log_config = dict(
+ interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
+ optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001)
+ optimizer_config = dict()
+ lr_config = dict(policy='poly', power=0.9, min_lr=0.0, by_epoch=True)
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
+ checkpoint_config = dict(by_epoch=True, interval=100)
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
+ work_dir = 'work_dirs/petrova/danet-ps'
+ gpu_ids = [0]
+ auto_resume = False
data_utils/easyportrait/local_configs/easyportrait_experiments_v2/deeplab-fp/deeplabv3-fp.py ADDED
@@ -0,0 +1,174 @@
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='ASPPHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ dilations=(1, 12, 24, 36),
+ dropout_ratio=0.1,
+ num_classes=8,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=8,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
+ dataset_type = 'EasyPortraitFPDataset'
+ data_root = '/home/jovyan/datasets/wacv_24/'
+ img_norm_cfg = dict(
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True)
+ train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]
+ test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]
+ data = dict(
+ train=dict(
+ type='EasyPortraitFPDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+ 'right eye', 'lips', 'teeth'),
+ img_dir='easyportrait_384/images/train',
+ ann_dir='easyportrait_384/annotations_fp/train',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]),
+ val=dict(
+ type='EasyPortraitFPDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+ 'right eye', 'lips', 'teeth'),
+ img_dir='easyportrait_384/images/val',
+ ann_dir='easyportrait_384/annotations_fp/val',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ test=dict(
+ type='EasyPortraitFPDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+ 'right eye', 'lips', 'teeth'),
+ img_dir='easyportrait_384/images/test',
+ ann_dir='easyportrait_384/annotations_fp/test',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ samples_per_gpu=32,
+ workers_per_gpu=8)
+ log_config = dict(
+ interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
+ optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001)
+ optimizer_config = dict()
+ lr_config = dict(policy='poly', power=0.9, min_lr=0.0, by_epoch=True)
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
+ checkpoint_config = dict(by_epoch=True, interval=100)
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
+ work_dir = 'work_dirs/petrova/deeplabv3-fp'
+ gpu_ids = [0]
+ auto_resume = False
data_utils/easyportrait/local_configs/easyportrait_experiments_v2/deeplab-ps/deeplabv3-ps.py ADDED
@@ -0,0 +1,171 @@
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='ASPPHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ dilations=(1, 12, 24, 36),
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
+ dataset_type = 'EasyPortraitPSDataset'
+ data_root = '/home/jovyan/datasets/wacv_24/'
+ img_norm_cfg = dict(
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True)
+ train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]
+ test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]
+ data = dict(
+ train=dict(
+ type='EasyPortraitPSDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'person'),
+ img_dir='easyportrait_384/images/train',
+ ann_dir='easyportrait_384/annotations_ps/train',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]),
+ val=dict(
+ type='EasyPortraitPSDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'person'),
+ img_dir='easyportrait_384/images/val',
+ ann_dir='easyportrait_384/annotations_ps/val',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ test=dict(
+ type='EasyPortraitPSDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'person'),
+ img_dir='easyportrait_384/images/test',
+ ann_dir='easyportrait_384/annotations_ps/test',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ samples_per_gpu=32,
+ workers_per_gpu=8)
+ log_config = dict(
+ interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
+ optimizer = dict(type='AdamW', lr=0.0002, weight_decay=0.0001)
+ optimizer_config = dict()
+ lr_config = dict(policy='poly', power=0.9, min_lr=0.0, by_epoch=True)
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
+ checkpoint_config = dict(by_epoch=True, interval=100)
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
+ work_dir = 'work_dirs/petrova/deeplabv3-ps'
+ gpu_ids = [0]
+ auto_resume = False
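Once training has produced a checkpoint under the work_dir above, single-image inference follows the standard mmsegmentation 0.x pattern. A sketch; the checkpoint and image paths are placeholders, not files in this upload:

from mmseg.apis import init_segmentor, inference_segmentor

config = 'local_configs/easyportrait_experiments_v2/deeplab-ps/deeplabv3-ps.py'
checkpoint = 'work_dirs/petrova/deeplabv3-ps/latest.pth'  # placeholder name
model = init_segmentor(config, checkpoint, device='cuda:0')
result = inference_segmentor(model, 'portrait.jpg')  # list with one H x W label map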
data_utils/easyportrait/local_configs/easyportrait_experiments_v2/fastscnn-fp/fastscnn-fp.py ADDED
@@ -0,0 +1,165 @@
+ norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01)
+ model = dict(
+ type='EncoderDecoder',
+ backbone=dict(
+ type='FastSCNN',
+ downsample_dw_channels=(32, 48),
+ global_in_channels=64,
+ global_block_channels=(64, 96, 128),
+ global_block_strides=(2, 2, 1),
+ global_out_channels=128,
+ higher_in_channels=64,
+ lower_in_channels=128,
+ fusion_out_channels=128,
+ out_indices=(0, 1, 2),
+ norm_cfg=dict(type='SyncBN', requires_grad=True, momentum=0.01),
+ align_corners=False),
+ decode_head=dict(
+ type='DepthwiseSeparableFCNHead',
+ in_channels=128,
+ channels=128,
+ concat_input=False,
+ num_classes=8,
+ in_index=-1,
+ norm_cfg=dict(type='SyncBN', requires_grad=True, momentum=0.01),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1)),
+ auxiliary_head=[
+ dict(type='FCNHead', in_channels=128, channels=32, num_classes=8),
+ dict(type='FCNHead', in_channels=128, channels=32, num_classes=8)
+ ],
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
+ dataset_type = 'EasyPortraitFPDataset'
+ data_root = '/home/jovyan/datasets/wacv_24/'
+ img_norm_cfg = dict(
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True)
+ train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]
+ test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]
+ data = dict(
+ train=dict(
+ type='EasyPortraitFPDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+ 'right eye', 'lips', 'teeth'),
+ img_dir='easyportrait_384/images/train',
+ ann_dir='easyportrait_384/annotations_fp/train',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]),
+ val=dict(
+ type='EasyPortraitFPDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+ 'right eye', 'lips', 'teeth'),
+ img_dir='easyportrait_384/images/val',
+ ann_dir='easyportrait_384/annotations_fp/val',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ test=dict(
+ type='EasyPortraitFPDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+ 'right eye', 'lips', 'teeth'),
+ img_dir='easyportrait_384/images/test',
+ ann_dir='easyportrait_384/annotations_fp/test',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ samples_per_gpu=32,
+ workers_per_gpu=8)
+ log_config = dict(
+ interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
+ optimizer = dict(type='SGD', lr=0.12, weight_decay=4e-05, momentum=0.9)
+ optimizer_config = dict()
+ lr_config = dict(policy='poly', power=0.9, min_lr=0.0, by_epoch=True)
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
+ checkpoint_config = dict(by_epoch=True, interval=100)
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
+ work_dir = 'work_dirs/petrova/fast_scnn-fp'
+ gpu_ids = [0]
+ auto_resume = False
data_utils/easyportrait/local_configs/easyportrait_experiments_v2/fastscnn-ps/fastscnn-ps.py ADDED
@@ -0,0 +1,162 @@
+ norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01)
+ model = dict(
+ type='EncoderDecoder',
+ backbone=dict(
+ type='FastSCNN',
+ downsample_dw_channels=(32, 48),
+ global_in_channels=64,
+ global_block_channels=(64, 96, 128),
+ global_block_strides=(2, 2, 1),
+ global_out_channels=128,
+ higher_in_channels=64,
+ lower_in_channels=128,
+ fusion_out_channels=128,
+ out_indices=(0, 1, 2),
+ norm_cfg=dict(type='SyncBN', requires_grad=True, momentum=0.01),
+ align_corners=False),
+ decode_head=dict(
+ type='DepthwiseSeparableFCNHead',
+ in_channels=128,
+ channels=128,
+ concat_input=False,
+ num_classes=2,
+ in_index=-1,
+ norm_cfg=dict(type='SyncBN', requires_grad=True, momentum=0.01),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1)),
+ auxiliary_head=[
+ dict(type='FCNHead', in_channels=128, channels=32, num_classes=2),
+ dict(type='FCNHead', in_channels=128, channels=32, num_classes=2)
+ ],
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
+ dataset_type = 'EasyPortraitPSDataset'
+ data_root = '/home/jovyan/datasets/wacv_24/'
+ img_norm_cfg = dict(
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True)
+ train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]
+ test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]
+ data = dict(
+ train=dict(
+ type='EasyPortraitPSDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'person'),
+ img_dir='easyportrait_384/images/train',
+ ann_dir='easyportrait_384/annotations_ps/train',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='RandomFlip', prob=0.0),
+ dict(
+ type='PhotoMetricDistortion',
+ brightness_delta=16,
+ contrast_range=(0.5, 1.0),
+ saturation_range=(0.5, 1.0),
+ hue_delta=5),
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]),
+ val=dict(
+ type='EasyPortraitPSDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'person'),
+ img_dir='easyportrait_384/images/val',
+ ann_dir='easyportrait_384/annotations_ps/val',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ test=dict(
+ type='EasyPortraitPSDataset',
+ data_root='/home/jovyan/datasets/wacv_24/',
+ classes=('background', 'person'),
+ img_dir='easyportrait_384/images/test',
+ ann_dir='easyportrait_384/annotations_ps/test',
+ pipeline=[
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(384, 384),
+ flip=False,
+ transforms=[
+ dict(
+ type='Normalize',
+ mean=[143.55267075, 132.96705975, 126.94924335],
+ std=[60.2625333, 60.32740275, 59.30988645],
+ to_rgb=True),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+ ]),
+ samples_per_gpu=32,
+ workers_per_gpu=8)
+ log_config = dict(
+ interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
+ optimizer = dict(type='SGD', lr=0.12, weight_decay=4e-05, momentum=0.9)
+ optimizer_config = dict()
+ lr_config = dict(policy='poly', power=0.9, min_lr=0.0, by_epoch=True)
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
+ checkpoint_config = dict(by_epoch=True, interval=100)
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
+ work_dir = 'work_dirs/petrova/fast_scnn-ps'
+ gpu_ids = [0]
+ auto_resume = False
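All of these experiment configs set norm_cfg to SyncBN, which only initializes under a distributed launcher. For single-GPU or CPU debugging, recent mmcv 1.x releases ship a helper that swaps SyncBN back to plain BatchNorm; a hedged sketch, assuming `revert_sync_batchnorm` is available in the installed mmcv:

from mmcv.cnn.utils import revert_sync_batchnorm
from mmseg.apis import init_segmentor

# Checkpoint omitted, so weights stay randomly initialized; config path as above.
model = init_segmentor(
    'local_configs/easyportrait_experiments_v2/fastscnn-ps/fastscnn-ps.py',
    device='cpu')
model = revert_sync_batchnorm(model)  # SyncBN -> BN; runs without dist init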
data_utils/easyportrait/local_configs/easyportrait_experiments_v2/fcn-fp/fcn-fp.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
2
+ model = dict(
3
+ type='EncoderDecoder',
4
+ pretrained='mmcls://mobilenet_v2',
5
+ backbone=dict(
6
+ type='MobileNetV2',
7
+ widen_factor=1.0,
8
+ strides=(1, 2, 2, 1, 1, 1, 1),
9
+ dilations=(1, 1, 1, 2, 2, 4, 4),
10
+ out_indices=(1, 2, 4, 6),
11
+ norm_cfg=dict(type='SyncBN', requires_grad=True)),
12
+ decode_head=dict(
13
+ type='FCNHead',
14
+ in_channels=320,
15
+ in_index=3,
16
+ channels=512,
17
+ num_convs=2,
18
+ concat_input=True,
19
+ dropout_ratio=0.1,
20
+ num_classes=8,
21
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
22
+ align_corners=False,
23
+ loss_decode=dict(
24
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
25
+ auxiliary_head=dict(
26
+ type='FCNHead',
27
+ in_channels=96,
28
+ in_index=2,
29
+ channels=256,
30
+ num_convs=1,
31
+ concat_input=False,
32
+ dropout_ratio=0.1,
33
+ num_classes=8,
34
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
35
+ align_corners=False,
36
+ loss_decode=dict(
37
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
38
+ train_cfg=dict(),
39
+ test_cfg=dict(mode='whole'))
40
+ dataset_type = 'EasyPortraitFPDataset'
41
+ data_root = '/home/jovyan/datasets/wacv_24/'
42
+ img_norm_cfg = dict(
43
+ mean=[143.55267075, 132.96705975, 126.94924335],
44
+ std=[60.2625333, 60.32740275, 59.30988645],
45
+ to_rgb=True)
46
+ train_pipeline = [
47
+ dict(type='LoadImageFromFile'),
48
+ dict(type='LoadAnnotations'),
49
+ dict(type='RandomFlip', prob=0.0),
50
+ dict(
51
+ type='PhotoMetricDistortion',
52
+ brightness_delta=16,
53
+ contrast_range=(0.5, 1.0),
54
+ saturation_range=(0.5, 1.0),
55
+ hue_delta=5),
56
+ dict(
57
+ type='Normalize',
58
+ mean=[143.55267075, 132.96705975, 126.94924335],
59
+ std=[60.2625333, 60.32740275, 59.30988645],
60
+ to_rgb=True),
61
+ dict(type='DefaultFormatBundle'),
62
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
63
+ ]
64
+ test_pipeline = [
65
+ dict(type='LoadImageFromFile'),
66
+ dict(
67
+ type='MultiScaleFlipAug',
68
+ img_scale=(384, 384),
69
+ flip=False,
70
+ transforms=[
71
+ dict(
72
+ type='Normalize',
73
+ mean=[143.55267075, 132.96705975, 126.94924335],
74
+ std=[60.2625333, 60.32740275, 59.30988645],
75
+ to_rgb=True),
76
+ dict(type='ImageToTensor', keys=['img']),
77
+             dict(type='Collect', keys=['img'])
+         ])
+ ]
+ data = dict(
+     train=dict(
+         type='EasyPortraitFPDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+                  'right eye', 'lips', 'teeth'),
+         img_dir='easyportrait_384/images/train',
+         ann_dir='easyportrait_384/annotations_fp/train',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(type='LoadAnnotations'),
+             dict(type='RandomFlip', prob=0.0),
+             dict(
+                 type='PhotoMetricDistortion',
+                 brightness_delta=16,
+                 contrast_range=(0.5, 1.0),
+                 saturation_range=(0.5, 1.0),
+                 hue_delta=5),
+             dict(
+                 type='Normalize',
+                 mean=[143.55267075, 132.96705975, 126.94924335],
+                 std=[60.2625333, 60.32740275, 59.30988645],
+                 to_rgb=True),
+             dict(type='DefaultFormatBundle'),
+             dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+         ]),
+     val=dict(
+         type='EasyPortraitFPDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+                  'right eye', 'lips', 'teeth'),
+         img_dir='easyportrait_384/images/val',
+         ann_dir='easyportrait_384/annotations_fp/val',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(
+                 type='MultiScaleFlipAug',
+                 img_scale=(384, 384),
+                 flip=False,
+                 transforms=[
+                     dict(
+                         type='Normalize',
+                         mean=[143.55267075, 132.96705975, 126.94924335],
+                         std=[60.2625333, 60.32740275, 59.30988645],
+                         to_rgb=True),
+                     dict(type='ImageToTensor', keys=['img']),
+                     dict(type='Collect', keys=['img'])
+                 ])
+         ]),
+     test=dict(
+         type='EasyPortraitFPDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+                  'right eye', 'lips', 'teeth'),
+         img_dir='easyportrait_384/images/test',
+         ann_dir='easyportrait_384/annotations_fp/test',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(
+                 type='MultiScaleFlipAug',
+                 img_scale=(384, 384),
+                 flip=False,
+                 transforms=[
+                     dict(
+                         type='Normalize',
+                         mean=[143.55267075, 132.96705975, 126.94924335],
+                         std=[60.2625333, 60.32740275, 59.30988645],
+                         to_rgb=True),
+                     dict(type='ImageToTensor', keys=['img']),
+                     dict(type='Collect', keys=['img'])
+                 ])
+         ]),
+     samples_per_gpu=32,
+     workers_per_gpu=8)
+ log_config = dict(
+     interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
+ optimizer = dict(
+     type='AdamW',
+     lr=6e-05,
+     betas=(0.9, 0.999),
+     weight_decay=0.01,
+     paramwise_cfg=dict(
+         custom_keys=dict(
+             pos_block=dict(decay_mult=0.0),
+             norm=dict(decay_mult=0.0),
+             head=dict(lr_mult=10.0))))
+ optimizer_config = dict()
+ lr_config = dict(
+     policy='poly',
+     warmup='linear',
+     warmup_iters=1500,
+     warmup_ratio=1e-06,
+     power=1.0,
+     min_lr=0.0,
+     by_epoch=False)
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
+ checkpoint_config = dict(by_epoch=True, interval=100)
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
+ work_dir = 'work_dirs/petrova/fcn-fp'
+ gpu_ids = [0]
+ auto_resume = False
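The `lr_config` above encodes mmcv's poly policy with linear warmup. As a rough sketch of the resulting schedule (the `max_iters` value is a placeholder; the real count depends on dataset size and `max_epochs`):

def lr_at(i, base_lr=6e-05, max_iters=50000, warmup_iters=1500,
          warmup_ratio=1e-06, power=1.0, min_lr=0.0):
    # polynomial decay towards min_lr (mmcv PolyLrUpdaterHook semantics)
    coeff = (1 - i / max_iters) ** power
    regular = (base_lr - min_lr) * coeff + min_lr
    if i >= warmup_iters:
        return regular
    # linear warmup scales the decayed lr up from base_lr * warmup_ratio
    k = (1 - i / warmup_iters) * (1 - warmup_ratio)
    return regular * (1 - k)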
data_utils/easyportrait/local_configs/easyportrait_experiments_v2/fcn-ps/fcn-ps.py ADDED
@@ -0,0 +1,184 @@
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='mmcls://mobilenet_v2',
+     backbone=dict(
+         type='MobileNetV2',
+         widen_factor=1.0,
+         strides=(1, 2, 2, 1, 1, 1, 1),
+         dilations=(1, 1, 1, 2, 2, 4, 4),
+         out_indices=(1, 2, 4, 6),
+         norm_cfg=dict(type='SyncBN', requires_grad=True)),
+     decode_head=dict(
+         type='FCNHead',
+         in_channels=320,
+         in_index=3,
+         channels=512,
+         num_convs=2,
+         concat_input=True,
+         dropout_ratio=0.1,
+         num_classes=2,
+         norm_cfg=dict(type='SyncBN', requires_grad=True),
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     auxiliary_head=dict(
+         type='FCNHead',
+         in_channels=96,
+         in_index=2,
+         channels=256,
+         num_convs=1,
+         concat_input=False,
+         dropout_ratio=0.1,
+         num_classes=2,
+         norm_cfg=dict(type='SyncBN', requires_grad=True),
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
+ dataset_type = 'EasyPortraitPSDataset'
+ data_root = '/home/jovyan/datasets/wacv_24/'
+ img_norm_cfg = dict(
+     mean=[143.55267075, 132.96705975, 126.94924335],
+     std=[60.2625333, 60.32740275, 59.30988645],
+     to_rgb=True)
+ train_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(type='LoadAnnotations'),
+     dict(type='RandomFlip', prob=0.0),
+     dict(
+         type='PhotoMetricDistortion',
+         brightness_delta=16,
+         contrast_range=(0.5, 1.0),
+         saturation_range=(0.5, 1.0),
+         hue_delta=5),
+     dict(
+         type='Normalize',
+         mean=[143.55267075, 132.96705975, 126.94924335],
+         std=[60.2625333, 60.32740275, 59.30988645],
+         to_rgb=True),
+     dict(type='DefaultFormatBundle'),
+     dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]
+ test_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=(384, 384),
+         flip=False,
+         transforms=[
+             dict(
+                 type='Normalize',
+                 mean=[143.55267075, 132.96705975, 126.94924335],
+                 std=[60.2625333, 60.32740275, 59.30988645],
+                 to_rgb=True),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img'])
+         ])
+ ]
+ data = dict(
+     train=dict(
+         type='EasyPortraitPSDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'person'),
+         img_dir='easyportrait_384/images/train',
+         ann_dir='easyportrait_384/annotations_ps/train',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(type='LoadAnnotations'),
+             dict(type='RandomFlip', prob=0.0),
+             dict(
+                 type='PhotoMetricDistortion',
+                 brightness_delta=16,
+                 contrast_range=(0.5, 1.0),
+                 saturation_range=(0.5, 1.0),
+                 hue_delta=5),
+             dict(
+                 type='Normalize',
+                 mean=[143.55267075, 132.96705975, 126.94924335],
+                 std=[60.2625333, 60.32740275, 59.30988645],
+                 to_rgb=True),
+             dict(type='DefaultFormatBundle'),
+             dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+         ]),
+     val=dict(
+         type='EasyPortraitPSDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'person'),
+         img_dir='easyportrait_384/images/val',
+         ann_dir='easyportrait_384/annotations_ps/val',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(
+                 type='MultiScaleFlipAug',
+                 img_scale=(384, 384),
+                 flip=False,
+                 transforms=[
+                     dict(
+                         type='Normalize',
+                         mean=[143.55267075, 132.96705975, 126.94924335],
+                         std=[60.2625333, 60.32740275, 59.30988645],
+                         to_rgb=True),
+                     dict(type='ImageToTensor', keys=['img']),
+                     dict(type='Collect', keys=['img'])
+                 ])
+         ]),
+     test=dict(
+         type='EasyPortraitPSDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'person'),
+         img_dir='easyportrait_384/images/test',
+         ann_dir='easyportrait_384/annotations_ps/test',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(
+                 type='MultiScaleFlipAug',
+                 img_scale=(384, 384),
+                 flip=False,
+                 transforms=[
+                     dict(
+                         type='Normalize',
+                         mean=[143.55267075, 132.96705975, 126.94924335],
+                         std=[60.2625333, 60.32740275, 59.30988645],
+                         to_rgb=True),
+                     dict(type='ImageToTensor', keys=['img']),
+                     dict(type='Collect', keys=['img'])
+                 ])
+         ]),
+     samples_per_gpu=32,
+     workers_per_gpu=8)
+ log_config = dict(
+     interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
+ optimizer = dict(
+     type='AdamW',
+     lr=6e-05,
+     betas=(0.9, 0.999),
+     weight_decay=0.01,
+     paramwise_cfg=dict(
+         custom_keys=dict(
+             pos_block=dict(decay_mult=0.0),
+             norm=dict(decay_mult=0.0),
+             head=dict(lr_mult=10.0))))
+ optimizer_config = dict()
+ lr_config = dict(
+     policy='poly',
+     warmup='linear',
+     warmup_iters=1500,
+     warmup_ratio=1e-06,
+     power=1.0,
+     min_lr=0.0,
+     by_epoch=False)
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
+ checkpoint_config = dict(by_epoch=True, interval=100)
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
+ work_dir = 'work_dirs/petrova/fcn-ps'
+ gpu_ids = [0]
+ auto_resume = False
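The `Normalize` step used throughout these pipelines is per-channel standardization after a BGR-to-RGB swap (`to_rgb=True`). A minimal NumPy sketch of what it does to one frame, assuming an HWC uint8 BGR image as produced by `LoadImageFromFile`:

import numpy as np

MEAN = np.array([143.55267075, 132.96705975, 126.94924335])
STD = np.array([60.2625333, 60.32740275, 59.30988645])

def normalize(img_bgr):
    img = img_bgr[..., ::-1].astype(np.float32)  # to_rgb=True: BGR -> RGB
    return (img - MEAN) / STD                    # per-channel standardization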
data_utils/easyportrait/local_configs/easyportrait_experiments_v2/fpn-fp/fpn-fp.py ADDED
@@ -0,0 +1,182 @@
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 1, 1),
+         strides=(1, 2, 2, 2),
+         norm_cfg=dict(type='SyncBN', requires_grad=True),
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     neck=dict(
+         type='FPN',
+         in_channels=[256, 512, 1024, 2048],
+         out_channels=256,
+         num_outs=4),
+     decode_head=dict(
+         type='FPNHead',
+         in_channels=[256, 256, 256, 256],
+         in_index=[0, 1, 2, 3],
+         feature_strides=[4, 8, 16, 32],
+         channels=128,
+         dropout_ratio=0.1,
+         num_classes=8,
+         norm_cfg=dict(type='SyncBN', requires_grad=True),
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
+ dataset_type = 'EasyPortraitFPDataset'
+ data_root = '/home/jovyan/datasets/wacv_24/'
+ img_norm_cfg = dict(
+     mean=[143.55267075, 132.96705975, 126.94924335],
+     std=[60.2625333, 60.32740275, 59.30988645],
+     to_rgb=True)
+ train_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(type='LoadAnnotations'),
+     dict(type='RandomFlip', prob=0.0),
+     dict(
+         type='PhotoMetricDistortion',
+         brightness_delta=16,
+         contrast_range=(0.5, 1.0),
+         saturation_range=(0.5, 1.0),
+         hue_delta=5),
+     dict(
+         type='Normalize',
+         mean=[143.55267075, 132.96705975, 126.94924335],
+         std=[60.2625333, 60.32740275, 59.30988645],
+         to_rgb=True),
+     dict(type='DefaultFormatBundle'),
+     dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]
+ test_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=(384, 384),
+         flip=False,
+         transforms=[
+             dict(
+                 type='Normalize',
+                 mean=[143.55267075, 132.96705975, 126.94924335],
+                 std=[60.2625333, 60.32740275, 59.30988645],
+                 to_rgb=True),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img'])
+         ])
+ ]
+ data = dict(
+     train=dict(
+         type='EasyPortraitFPDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+                  'right eye', 'lips', 'teeth'),
+         img_dir='easyportrait_384/images/train',
+         ann_dir='easyportrait_384/annotations_fp/train',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(type='LoadAnnotations'),
+             dict(type='RandomFlip', prob=0.0),
+             dict(
+                 type='PhotoMetricDistortion',
+                 brightness_delta=16,
+                 contrast_range=(0.5, 1.0),
+                 saturation_range=(0.5, 1.0),
+                 hue_delta=5),
+             dict(
+                 type='Normalize',
+                 mean=[143.55267075, 132.96705975, 126.94924335],
+                 std=[60.2625333, 60.32740275, 59.30988645],
+                 to_rgb=True),
+             dict(type='DefaultFormatBundle'),
+             dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+         ]),
+     val=dict(
+         type='EasyPortraitFPDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+                  'right eye', 'lips', 'teeth'),
+         img_dir='easyportrait_384/images/val',
+         ann_dir='easyportrait_384/annotations_fp/val',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(
+                 type='MultiScaleFlipAug',
+                 img_scale=(384, 384),
+                 flip=False,
+                 transforms=[
+                     dict(
+                         type='Normalize',
+                         mean=[143.55267075, 132.96705975, 126.94924335],
+                         std=[60.2625333, 60.32740275, 59.30988645],
+                         to_rgb=True),
+                     dict(type='ImageToTensor', keys=['img']),
+                     dict(type='Collect', keys=['img'])
+                 ])
+         ]),
+     test=dict(
+         type='EasyPortraitFPDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+                  'right eye', 'lips', 'teeth'),
+         img_dir='easyportrait_384/images/test',
+         ann_dir='easyportrait_384/annotations_fp/test',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(
+                 type='MultiScaleFlipAug',
+                 img_scale=(384, 384),
+                 flip=False,
+                 transforms=[
+                     dict(
+                         type='Normalize',
+                         mean=[143.55267075, 132.96705975, 126.94924335],
+                         std=[60.2625333, 60.32740275, 59.30988645],
+                         to_rgb=True),
+                     dict(type='ImageToTensor', keys=['img']),
+                     dict(type='Collect', keys=['img'])
+                 ])
+         ]),
+     samples_per_gpu=32,
+     workers_per_gpu=8)
+ log_config = dict(
+     interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
+ optimizer = dict(
+     type='AdamW',
+     lr=6e-05,
+     betas=(0.9, 0.999),
+     weight_decay=0.01,
+     paramwise_cfg=dict(
+         custom_keys=dict(
+             pos_block=dict(decay_mult=0.0),
+             norm=dict(decay_mult=0.0),
+             head=dict(lr_mult=10.0))))
+ optimizer_config = dict()
+ lr_config = dict(
+     policy='poly',
+     warmup='linear',
+     warmup_iters=1500,
+     warmup_ratio=1e-06,
+     power=1.0,
+     min_lr=0.0,
+     by_epoch=False)
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
+ checkpoint_config = dict(by_epoch=True, interval=100)
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
+ work_dir = 'work_dirs/petrova/fpn-fp'
+ gpu_ids = [0]
+ auto_resume = False
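A config like this plugs straight into the inference helpers added later in this commit (`mmseg/apis/inference.py`). A hypothetical usage sketch; the config, checkpoint and image paths are placeholders:

from mmseg.apis import inference_segmentor, init_segmentor

cfg_path = 'local_configs/easyportrait_experiments_v2/fpn-fp/fpn-fp.py'
model = init_segmentor(cfg_path, checkpoint='fpn-fp.pth', device='cuda:0')
seg_map = inference_segmentor(model, 'portrait.jpg')[0]  # HxW class indices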
data_utils/easyportrait/local_configs/easyportrait_experiments_v2/fpn-ps/fpn-ps.py ADDED
@@ -0,0 +1,179 @@
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained='open-mmlab://resnet50_v1c',
+     backbone=dict(
+         type='ResNetV1c',
+         depth=50,
+         num_stages=4,
+         out_indices=(0, 1, 2, 3),
+         dilations=(1, 1, 1, 1),
+         strides=(1, 2, 2, 2),
+         norm_cfg=dict(type='SyncBN', requires_grad=True),
+         norm_eval=False,
+         style='pytorch',
+         contract_dilation=True),
+     neck=dict(
+         type='FPN',
+         in_channels=[256, 512, 1024, 2048],
+         out_channels=256,
+         num_outs=4),
+     decode_head=dict(
+         type='FPNHead',
+         in_channels=[256, 256, 256, 256],
+         in_index=[0, 1, 2, 3],
+         feature_strides=[4, 8, 16, 32],
+         channels=128,
+         dropout_ratio=0.1,
+         num_classes=2,
+         norm_cfg=dict(type='SyncBN', requires_grad=True),
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
+ dataset_type = 'EasyPortraitPSDataset'
+ data_root = '/home/jovyan/datasets/wacv_24/'
+ img_norm_cfg = dict(
+     mean=[143.55267075, 132.96705975, 126.94924335],
+     std=[60.2625333, 60.32740275, 59.30988645],
+     to_rgb=True)
+ train_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(type='LoadAnnotations'),
+     dict(type='RandomFlip', prob=0.0),
+     dict(
+         type='PhotoMetricDistortion',
+         brightness_delta=16,
+         contrast_range=(0.5, 1.0),
+         saturation_range=(0.5, 1.0),
+         hue_delta=5),
+     dict(
+         type='Normalize',
+         mean=[143.55267075, 132.96705975, 126.94924335],
+         std=[60.2625333, 60.32740275, 59.30988645],
+         to_rgb=True),
+     dict(type='DefaultFormatBundle'),
+     dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]
+ test_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=(384, 384),
+         flip=False,
+         transforms=[
+             dict(
+                 type='Normalize',
+                 mean=[143.55267075, 132.96705975, 126.94924335],
+                 std=[60.2625333, 60.32740275, 59.30988645],
+                 to_rgb=True),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img'])
+         ])
+ ]
+ data = dict(
+     train=dict(
+         type='EasyPortraitPSDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'person'),
+         img_dir='easyportrait_384/images/train',
+         ann_dir='easyportrait_384/annotations_ps/train',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(type='LoadAnnotations'),
+             dict(type='RandomFlip', prob=0.0),
+             dict(
+                 type='PhotoMetricDistortion',
+                 brightness_delta=16,
+                 contrast_range=(0.5, 1.0),
+                 saturation_range=(0.5, 1.0),
+                 hue_delta=5),
+             dict(
+                 type='Normalize',
+                 mean=[143.55267075, 132.96705975, 126.94924335],
+                 std=[60.2625333, 60.32740275, 59.30988645],
+                 to_rgb=True),
+             dict(type='DefaultFormatBundle'),
+             dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+         ]),
+     val=dict(
+         type='EasyPortraitPSDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'person'),
+         img_dir='easyportrait_384/images/val',
+         ann_dir='easyportrait_384/annotations_ps/val',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(
+                 type='MultiScaleFlipAug',
+                 img_scale=(384, 384),
+                 flip=False,
+                 transforms=[
+                     dict(
+                         type='Normalize',
+                         mean=[143.55267075, 132.96705975, 126.94924335],
+                         std=[60.2625333, 60.32740275, 59.30988645],
+                         to_rgb=True),
+                     dict(type='ImageToTensor', keys=['img']),
+                     dict(type='Collect', keys=['img'])
+                 ])
+         ]),
+     test=dict(
+         type='EasyPortraitPSDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'person'),
+         img_dir='easyportrait_384/images/test',
+         ann_dir='easyportrait_384/annotations_ps/test',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(
+                 type='MultiScaleFlipAug',
+                 img_scale=(384, 384),
+                 flip=False,
+                 transforms=[
+                     dict(
+                         type='Normalize',
+                         mean=[143.55267075, 132.96705975, 126.94924335],
+                         std=[60.2625333, 60.32740275, 59.30988645],
+                         to_rgb=True),
+                     dict(type='ImageToTensor', keys=['img']),
+                     dict(type='Collect', keys=['img'])
+                 ])
+         ]),
+     samples_per_gpu=32,
+     workers_per_gpu=8)
+ log_config = dict(
+     interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
+ optimizer = dict(
+     type='AdamW',
+     lr=6e-05,
+     betas=(0.9, 0.999),
+     weight_decay=0.01,
+     paramwise_cfg=dict(
+         custom_keys=dict(
+             pos_block=dict(decay_mult=0.0),
+             norm=dict(decay_mult=0.0),
+             head=dict(lr_mult=10.0))))
+ optimizer_config = dict()
+ lr_config = dict(
+     policy='poly',
+     warmup='linear',
+     warmup_iters=1500,
+     warmup_ratio=1e-06,
+     power=1.0,
+     min_lr=0.0,
+     by_epoch=False)
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
+ checkpoint_config = dict(by_epoch=True, interval=100)
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
+ work_dir = 'work_dirs/petrova/fpn-ps'
+ gpu_ids = [0]
+ auto_resume = False
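The shared `img_norm_cfg` values look like per-channel RGB mean/std computed over the training images. A sketch of how such statistics are commonly derived (the file list and OpenCV reader here are assumptions, not the authors' script):

import cv2
import numpy as np

def channel_stats(paths):
    # stack all pixels as an (N, 3) RGB array and reduce per channel
    pixels = np.concatenate(
        [cv2.imread(p)[..., ::-1].reshape(-1, 3) for p in paths])
    return pixels.mean(axis=0), pixels.std(axis=0)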
data_utils/easyportrait/local_configs/easyportrait_experiments_v2/segformer-fp/segformer-fp.py ADDED
@@ -0,0 +1,182 @@
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained=
+     'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b0_20220624-7e0fe6dd.pth',
+     backbone=dict(
+         type='MixVisionTransformer',
+         in_channels=3,
+         embed_dims=32,
+         num_stages=4,
+         num_layers=[2, 2, 2, 2],
+         num_heads=[1, 2, 5, 8],
+         patch_sizes=[7, 3, 3, 3],
+         sr_ratios=[8, 4, 2, 1],
+         out_indices=(0, 1, 2, 3),
+         mlp_ratio=4,
+         qkv_bias=True,
+         drop_rate=0.0,
+         attn_drop_rate=0.0,
+         drop_path_rate=0.1),
+     decode_head=dict(
+         type='SegformerHead',
+         in_channels=[32, 64, 160, 256],
+         in_index=[0, 1, 2, 3],
+         channels=256,
+         dropout_ratio=0.1,
+         num_classes=8,
+         norm_cfg=dict(type='SyncBN', requires_grad=True),
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
+ dataset_type = 'EasyPortraitFPDataset'
+ data_root = '/home/jovyan/datasets/wacv_24/'
+ img_norm_cfg = dict(
+     mean=[143.55267075, 132.96705975, 126.94924335],
+     std=[60.2625333, 60.32740275, 59.30988645],
+     to_rgb=True)
+ train_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(type='LoadAnnotations'),
+     dict(type='RandomFlip', prob=0.0),
+     dict(
+         type='PhotoMetricDistortion',
+         brightness_delta=16,
+         contrast_range=(0.5, 1.0),
+         saturation_range=(0.5, 1.0),
+         hue_delta=5),
+     dict(
+         type='Normalize',
+         mean=[143.55267075, 132.96705975, 126.94924335],
+         std=[60.2625333, 60.32740275, 59.30988645],
+         to_rgb=True),
+     dict(type='DefaultFormatBundle'),
+     dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]
+ test_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=(384, 384),
+         flip=False,
+         transforms=[
+             dict(
+                 type='Normalize',
+                 mean=[143.55267075, 132.96705975, 126.94924335],
+                 std=[60.2625333, 60.32740275, 59.30988645],
+                 to_rgb=True),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img'])
+         ])
+ ]
+ data = dict(
+     train=dict(
+         type='EasyPortraitFPDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+                  'right eye', 'lips', 'teeth'),
+         img_dir='easyportrait_384/images/train',
+         ann_dir='easyportrait_384/annotations_fp/train',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(type='LoadAnnotations'),
+             dict(type='RandomFlip', prob=0.0),
+             dict(
+                 type='PhotoMetricDistortion',
+                 brightness_delta=16,
+                 contrast_range=(0.5, 1.0),
+                 saturation_range=(0.5, 1.0),
+                 hue_delta=5),
+             dict(
+                 type='Normalize',
+                 mean=[143.55267075, 132.96705975, 126.94924335],
+                 std=[60.2625333, 60.32740275, 59.30988645],
+                 to_rgb=True),
+             dict(type='DefaultFormatBundle'),
+             dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+         ]),
+     val=dict(
+         type='EasyPortraitFPDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+                  'right eye', 'lips', 'teeth'),
+         img_dir='easyportrait_384/images/val',
+         ann_dir='easyportrait_384/annotations_fp/val',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(
+                 type='MultiScaleFlipAug',
+                 img_scale=(384, 384),
+                 flip=False,
+                 transforms=[
+                     dict(
+                         type='Normalize',
+                         mean=[143.55267075, 132.96705975, 126.94924335],
+                         std=[60.2625333, 60.32740275, 59.30988645],
+                         to_rgb=True),
+                     dict(type='ImageToTensor', keys=['img']),
+                     dict(type='Collect', keys=['img'])
+                 ])
+         ]),
+     test=dict(
+         type='EasyPortraitFPDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'skin', 'left brow', 'right brow', 'left eye',
+                  'right eye', 'lips', 'teeth'),
+         img_dir='easyportrait_384/images/test',
+         ann_dir='easyportrait_384/annotations_fp/test',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(
+                 type='MultiScaleFlipAug',
+                 img_scale=(384, 384),
+                 flip=False,
+                 transforms=[
+                     dict(
+                         type='Normalize',
+                         mean=[143.55267075, 132.96705975, 126.94924335],
+                         std=[60.2625333, 60.32740275, 59.30988645],
+                         to_rgb=True),
+                     dict(type='ImageToTensor', keys=['img']),
+                     dict(type='Collect', keys=['img'])
+                 ])
+         ]),
+     samples_per_gpu=32,
+     workers_per_gpu=8)
+ log_config = dict(
+     interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
+ optimizer = dict(
+     type='AdamW',
+     lr=6e-05,
+     betas=(0.9, 0.999),
+     weight_decay=0.01,
+     paramwise_cfg=dict(
+         custom_keys=dict(
+             pos_block=dict(decay_mult=0.0),
+             norm=dict(decay_mult=0.0),
+             head=dict(lr_mult=10.0))))
+ optimizer_config = dict()
+ lr_config = dict(
+     policy='poly',
+     warmup='linear',
+     warmup_iters=1500,
+     warmup_ratio=1e-06,
+     power=1.0,
+     min_lr=0.0,
+     by_epoch=False)
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
+ checkpoint_config = dict(by_epoch=True, interval=100)
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
+ checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b0_20220624-7e0fe6dd.pth'
+ work_dir = 'work_dirs/petrova/segformer-fp'
+ gpu_ids = [0]
+ auto_resume = False
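A note on `decode_head.in_channels=[32, 64, 160, 256]`: in mmseg's `MixVisionTransformer` the width of stage `i` is `embed_dims * num_heads[i]`, so the head's inputs follow directly from the backbone settings above:

embed_dims = 32
num_heads = [1, 2, 5, 8]
print([embed_dims * h for h in num_heads])  # [32, 64, 160, 256]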
data_utils/easyportrait/local_configs/easyportrait_experiments_v2/segformer-ps/segformer-ps.py ADDED
@@ -0,0 +1,179 @@
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
+ model = dict(
+     type='EncoderDecoder',
+     pretrained=
+     'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b0_20220624-7e0fe6dd.pth',
+     backbone=dict(
+         type='MixVisionTransformer',
+         in_channels=3,
+         embed_dims=32,
+         num_stages=4,
+         num_layers=[2, 2, 2, 2],
+         num_heads=[1, 2, 5, 8],
+         patch_sizes=[7, 3, 3, 3],
+         sr_ratios=[8, 4, 2, 1],
+         out_indices=(0, 1, 2, 3),
+         mlp_ratio=4,
+         qkv_bias=True,
+         drop_rate=0.0,
+         attn_drop_rate=0.0,
+         drop_path_rate=0.1),
+     decode_head=dict(
+         type='SegformerHead',
+         in_channels=[32, 64, 160, 256],
+         in_index=[0, 1, 2, 3],
+         channels=256,
+         dropout_ratio=0.1,
+         num_classes=2,
+         norm_cfg=dict(type='SyncBN', requires_grad=True),
+         align_corners=False,
+         loss_decode=dict(
+             type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+     train_cfg=dict(),
+     test_cfg=dict(mode='whole'))
+ dataset_type = 'EasyPortraitPSDataset'
+ data_root = '/home/jovyan/datasets/wacv_24/'
+ img_norm_cfg = dict(
+     mean=[143.55267075, 132.96705975, 126.94924335],
+     std=[60.2625333, 60.32740275, 59.30988645],
+     to_rgb=True)
+ train_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(type='LoadAnnotations'),
+     dict(type='RandomFlip', prob=0.0),
+     dict(
+         type='PhotoMetricDistortion',
+         brightness_delta=16,
+         contrast_range=(0.5, 1.0),
+         saturation_range=(0.5, 1.0),
+         hue_delta=5),
+     dict(
+         type='Normalize',
+         mean=[143.55267075, 132.96705975, 126.94924335],
+         std=[60.2625333, 60.32740275, 59.30988645],
+         to_rgb=True),
+     dict(type='DefaultFormatBundle'),
+     dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+ ]
+ test_pipeline = [
+     dict(type='LoadImageFromFile'),
+     dict(
+         type='MultiScaleFlipAug',
+         img_scale=(384, 384),
+         flip=False,
+         transforms=[
+             dict(
+                 type='Normalize',
+                 mean=[143.55267075, 132.96705975, 126.94924335],
+                 std=[60.2625333, 60.32740275, 59.30988645],
+                 to_rgb=True),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img'])
+         ])
+ ]
+ data = dict(
+     train=dict(
+         type='EasyPortraitPSDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'person'),
+         img_dir='easyportrait_384/images/train',
+         ann_dir='easyportrait_384/annotations_ps/train',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(type='LoadAnnotations'),
+             dict(type='RandomFlip', prob=0.0),
+             dict(
+                 type='PhotoMetricDistortion',
+                 brightness_delta=16,
+                 contrast_range=(0.5, 1.0),
+                 saturation_range=(0.5, 1.0),
+                 hue_delta=5),
+             dict(
+                 type='Normalize',
+                 mean=[143.55267075, 132.96705975, 126.94924335],
+                 std=[60.2625333, 60.32740275, 59.30988645],
+                 to_rgb=True),
+             dict(type='DefaultFormatBundle'),
+             dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+         ]),
+     val=dict(
+         type='EasyPortraitPSDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'person'),
+         img_dir='easyportrait_384/images/val',
+         ann_dir='easyportrait_384/annotations_ps/val',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(
+                 type='MultiScaleFlipAug',
+                 img_scale=(384, 384),
+                 flip=False,
+                 transforms=[
+                     dict(
+                         type='Normalize',
+                         mean=[143.55267075, 132.96705975, 126.94924335],
+                         std=[60.2625333, 60.32740275, 59.30988645],
+                         to_rgb=True),
+                     dict(type='ImageToTensor', keys=['img']),
+                     dict(type='Collect', keys=['img'])
+                 ])
+         ]),
+     test=dict(
+         type='EasyPortraitPSDataset',
+         data_root='/home/jovyan/datasets/wacv_24/',
+         classes=('background', 'person'),
+         img_dir='easyportrait_384/images/test',
+         ann_dir='easyportrait_384/annotations_ps/test',
+         pipeline=[
+             dict(type='LoadImageFromFile'),
+             dict(
+                 type='MultiScaleFlipAug',
+                 img_scale=(384, 384),
+                 flip=False,
+                 transforms=[
+                     dict(
+                         type='Normalize',
+                         mean=[143.55267075, 132.96705975, 126.94924335],
+                         std=[60.2625333, 60.32740275, 59.30988645],
+                         to_rgb=True),
+                     dict(type='ImageToTensor', keys=['img']),
+                     dict(type='Collect', keys=['img'])
+                 ])
+         ]),
+     samples_per_gpu=32,
+     workers_per_gpu=8)
+ log_config = dict(
+     interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
+ dist_params = dict(backend='nccl')
+ log_level = 'INFO'
+ load_from = None
+ resume_from = None
+ workflow = [('train', 1)]
+ cudnn_benchmark = True
+ optimizer = dict(
+     type='AdamW',
+     lr=6e-05,
+     betas=(0.9, 0.999),
+     weight_decay=0.01,
+     paramwise_cfg=dict(
+         custom_keys=dict(
+             pos_block=dict(decay_mult=0.0),
+             norm=dict(decay_mult=0.0),
+             head=dict(lr_mult=10.0))))
+ optimizer_config = dict()
+ lr_config = dict(
+     policy='poly',
+     warmup='linear',
+     warmup_iters=1500,
+     warmup_ratio=1e-06,
+     power=1.0,
+     min_lr=0.0,
+     by_epoch=False)
+ default_hooks = dict(stop=dict(type='EarlyStoppingHook', monitor='mIoU'))
+ runner = dict(type='EpochBasedRunner', max_epochs=100)
+ checkpoint_config = dict(by_epoch=True, interval=100)
+ evaluation = dict(interval=1, metric='mIoU', save_best='mIoU')
+ checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b0_20220624-7e0fe6dd.pth'
+ work_dir = 'work_dirs/petrova/segformer-ps'
+ gpu_ids = [0]
+ auto_resume = False
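With `classes=('background', 'person')` the model predicts an HxW array of class indices, so a binary person matte is a one-liner. A small sketch (the 0/255 scaling is only a display convention):

import numpy as np

def person_mask(seg):
    # seg: HxW int array from the segmentor; class index 1 is 'person'
    return (seg == 1).astype(np.uint8) * 255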
data_utils/easyportrait/mmseg/.mim/configs ADDED
File without changes
data_utils/easyportrait/mmseg/.mim/tools ADDED
File without changes
data_utils/easyportrait/mmseg/__init__.py ADDED
@@ -0,0 +1,62 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ import warnings
+
+ import mmcv
+ from packaging.version import parse
+
+ from .version import __version__, version_info
+
+ MMCV_MIN = '1.3.13'
+ MMCV_MAX = '1.8.0'
+
+
+ def digit_version(version_str: str, length: int = 4):
+     """Convert a version string into a tuple of integers.
+
+     This method is usually used for comparing two versions. For pre-release
+     versions: alpha < beta < rc.
+
+     Args:
+         version_str (str): The version string.
+         length (int): The maximum number of version levels. Default: 4.
+
+     Returns:
+         tuple[int]: The version info in digits (integers).
+     """
+     version = parse(version_str)
+     assert version.release, f'failed to parse version {version_str}'
+     release = list(version.release)
+     release = release[:length]
+     if len(release) < length:
+         release = release + [0] * (length - len(release))
+     if version.is_prerelease:
+         mapping = {'a': -3, 'b': -2, 'rc': -1}
+         val = -4
+         # version.pre can be None
+         if version.pre:
+             if version.pre[0] not in mapping:
+                 warnings.warn(f'unknown prerelease version {version.pre[0]}, '
+                               'version checking may go wrong')
+             else:
+                 val = mapping[version.pre[0]]
+             release.extend([val, version.pre[-1]])
+         else:
+             release.extend([val, 0])
+
+     elif version.is_postrelease:
+         release.extend([1, version.post])
+     else:
+         release.extend([0, 0])
+     return tuple(release)
+
+
+ mmcv_min_version = digit_version(MMCV_MIN)
+ mmcv_max_version = digit_version(MMCV_MAX)
+ mmcv_version = digit_version(mmcv.__version__)
+
+
+ assert (mmcv_min_version <= mmcv_version < mmcv_max_version), \
+     f'MMCV=={mmcv.__version__} is used but incompatible. ' \
+     f'Please install mmcv>={mmcv_min_version}, <{mmcv_max_version}.'
+
+ __all__ = ['__version__', 'version_info', 'digit_version']
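A few sample values of `digit_version` as defined above, showing how pre-release tags sort below final releases:

from mmseg import digit_version

print(digit_version('1.3.13'))    # (1, 3, 13, 0, 0, 0)
print(digit_version('1.7.0rc1'))  # (1, 7, 0, 0, -1, 1) -> sorts before 1.7.0
print(digit_version('1.5'))       # (1, 5, 0, 0, 0, 0)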
data_utils/easyportrait/mmseg/apis/__init__.py ADDED
@@ -0,0 +1,11 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ from .inference import inference_segmentor, init_segmentor, show_result_pyplot
+ from .test import multi_gpu_test, single_gpu_test
+ from .train import (get_root_logger, init_random_seed, set_random_seed,
+                     train_segmentor)
+
+ __all__ = [
+     'get_root_logger', 'set_random_seed', 'train_segmentor', 'init_segmentor',
+     'inference_segmentor', 'multi_gpu_test', 'single_gpu_test',
+     'show_result_pyplot', 'init_random_seed'
+ ]
data_utils/easyportrait/mmseg/apis/inference.py ADDED
@@ -0,0 +1,145 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ import matplotlib.pyplot as plt
+ import mmcv
+ import torch
+ from mmcv.parallel import collate, scatter
+ from mmcv.runner import load_checkpoint
+
+ from mmseg.datasets.pipelines import Compose
+ from mmseg.models import build_segmentor
+
+
+ def init_segmentor(config, checkpoint=None, device='cuda:0'):
+     """Initialize a segmentor from config file.
+
+     Args:
+         config (str or :obj:`mmcv.Config`): Config file path or the config
+             object.
+         checkpoint (str, optional): Checkpoint path. If left as None, the model
+             will not load any weights.
+         device (str, optional): CPU/CUDA device option. Default 'cuda:0'.
+             Use 'cpu' for loading model on CPU.
+     Returns:
+         nn.Module: The constructed segmentor.
+     """
+     if isinstance(config, str):
+         config = mmcv.Config.fromfile(config)
+     elif not isinstance(config, mmcv.Config):
+         raise TypeError('config must be a filename or Config object, '
+                         'but got {}'.format(type(config)))
+     config.model.pretrained = None
+     config.model.train_cfg = None
+     model = build_segmentor(config.model, test_cfg=config.get('test_cfg'))
+     if checkpoint is not None:
+         checkpoint = load_checkpoint(model, checkpoint, map_location='cpu')
+         model.CLASSES = checkpoint['meta']['CLASSES']
+         model.PALETTE = checkpoint['meta']['PALETTE']
+     model.cfg = config  # save the config in the model for convenience
+     model.to(device)
+     model.eval()
+     return model
+
+
+ class LoadImage:
+     """A simple pipeline to load image."""
+
+     def __call__(self, results):
+         """Call function to load images into results.
+
+         Args:
+             results (dict): A result dict containing the file name
+                 of the image to be read.
+
+         Returns:
+             dict: ``results`` will be returned containing loaded image.
+         """
+
+         if isinstance(results['img'], str):
+             results['filename'] = results['img']
+             results['ori_filename'] = results['img']
+         else:
+             results['filename'] = None
+             results['ori_filename'] = None
+         img = mmcv.imread(results['img'])
+         results['img'] = img
+         results['img_shape'] = img.shape
+         results['ori_shape'] = img.shape
+         return results
+
+
+ def inference_segmentor(model, imgs):
+     """Inference image(s) with the segmentor.
+
+     Args:
+         model (nn.Module): The loaded segmentor.
+         imgs (str/ndarray or list[str/ndarray]): Either image files or loaded
+             images.
+
+     Returns:
+         (list[Tensor]): The segmentation result.
+     """
+     cfg = model.cfg
+     device = next(model.parameters()).device  # model device
+     # build the data pipeline
+     test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:]
+     test_pipeline = Compose(test_pipeline)
+     # prepare data
+     data = []
+     imgs = imgs if isinstance(imgs, list) else [imgs]
+     for img in imgs:
+         img_data = dict(img=img)
+         img_data = test_pipeline(img_data)
+         data.append(img_data)
+     data = collate(data, samples_per_gpu=len(imgs))
+     if next(model.parameters()).is_cuda:
+         # scatter to specified GPU
+         data = scatter(data, [device])[0]
+     else:
+         data['img_metas'] = [i.data[0] for i in data['img_metas']]
+
+     # forward the model
+     with torch.no_grad():
+         result = model(return_loss=False, rescale=True, **data)
+     return result
+
+
+ def show_result_pyplot(model,
+                        img,
+                        result,
+                        palette=None,
+                        fig_size=(15, 10),
+                        opacity=0.5,
+                        title='',
+                        block=True,
+                        out_file=None):
+     """Visualize the segmentation results on the image.
+
+     Args:
+         model (nn.Module): The loaded segmentor.
+         img (str or np.ndarray): Image filename or loaded image.
+         result (list): The segmentation result.
+         palette (list[list[int]] | None): The palette of the segmentation
+             map. If None is given, a random palette will be generated.
+             Default: None.
+         fig_size (tuple): Figure size of the pyplot figure.
+         opacity (float): Opacity of the painted segmentation map.
+             Default 0.5. Must be in (0, 1] range.
+         title (str): The title of the pyplot figure.
+             Default is ''.
+         block (bool): Whether to block the pyplot figure.
+             Default is True.
+         out_file (str or None): The path to write the image.
+             Default: None.
+     """
+     if hasattr(model, 'module'):
+         model = model.module
+     img = model.show_result(
+         img, result, palette=palette, show=False, opacity=opacity)
+     plt.figure(figsize=fig_size)
+     plt.imshow(mmcv.bgr2rgb(img))
+     plt.title(title)
+     plt.tight_layout()
+     plt.show(block=block)
+     if out_file is not None:
+         mmcv.imwrite(img, out_file)
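The three helpers combine into a short end-to-end snippet; the config, checkpoint and image paths below are placeholders:

from mmseg.apis import inference_segmentor, init_segmentor, show_result_pyplot

model = init_segmentor('configs/my_config.py', 'work_dirs/latest.pth',
                       device='cuda:0')
result = inference_segmentor(model, 'demo.jpg')
show_result_pyplot(model, 'demo.jpg', result, opacity=0.5,
                   out_file='demo_seg.png')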
data_utils/easyportrait/mmseg/apis/test.py ADDED
@@ -0,0 +1,233 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ import os.path as osp
+ import tempfile
+ import warnings
+
+ import mmcv
+ import numpy as np
+ import torch
+ from mmcv.engine import collect_results_cpu, collect_results_gpu
+ from mmcv.image import tensor2imgs
+ from mmcv.runner import get_dist_info
+
+
+ def np2tmp(array, temp_file_name=None, tmpdir=None):
+     """Save ndarray to local numpy file.
+
+     Args:
+         array (ndarray): Ndarray to save.
+         temp_file_name (str): Numpy file name. If 'temp_file_name=None', this
+             function will generate a file name with tempfile.NamedTemporaryFile
+             to save the ndarray. Default: None.
+         tmpdir (str): Temporary directory to save ndarray files. Default: None.
+     Returns:
+         str: The numpy file name.
+     """
+
+     if temp_file_name is None:
+         temp_file_name = tempfile.NamedTemporaryFile(
+             suffix='.npy', delete=False, dir=tmpdir).name
+     np.save(temp_file_name, array)
+     return temp_file_name
+
+
+ def single_gpu_test(model,
+                     data_loader,
+                     show=False,
+                     out_dir=None,
+                     efficient_test=False,
+                     opacity=0.5,
+                     pre_eval=False,
+                     format_only=False,
+                     format_args={}):
+     """Test with single GPU by progressive mode.
+
+     Args:
+         model (nn.Module): Model to be tested.
+         data_loader (utils.data.Dataloader): Pytorch data loader.
+         show (bool): Whether to show results during inference. Default: False.
+         out_dir (str, optional): If specified, the output results will be
+             dumped into this directory.
+         efficient_test (bool): Whether to save the results as local numpy
+             files to save CPU memory during evaluation. Mutually exclusive
+             with pre_eval and format_results. Default: False.
+         opacity (float): Opacity of the painted segmentation map.
+             Default 0.5. Must be in (0, 1] range.
+         pre_eval (bool): Use dataset.pre_eval() function to generate
+             pre_results for metric evaluation. Mutually exclusive with
+             efficient_test and format_results. Default: False.
+         format_only (bool): Only format the result for a results commit.
+             Mutually exclusive with pre_eval and efficient_test.
+             Default: False.
+         format_args (dict): The args for format_results. Default: {}.
+     Returns:
+         list: list of evaluation pre-results or list of saved file names.
+     """
+     if efficient_test:
+         warnings.warn(
+             'DeprecationWarning: ``efficient_test`` will be deprecated, the '
+             'evaluation is CPU memory friendly with pre_eval=True')
+         mmcv.mkdir_or_exist('.efficient_test')
+     # when none of them is set true, return segmentation results as
+     # a list of np.array.
+     assert [efficient_test, pre_eval, format_only].count(True) <= 1, \
+         '``efficient_test``, ``pre_eval`` and ``format_only`` are mutually ' \
+         'exclusive, only one of them can be true.'
+
+     model.eval()
+     results = []
+     dataset = data_loader.dataset
+     prog_bar = mmcv.ProgressBar(len(dataset))
+     # The pipeline by which the data_loader retrieves samples from dataset:
+     # sampler -> batch_sampler -> indices
+     # The indices are passed to dataset_fetcher to get data from dataset.
+     # data_fetcher -> collate_fn(dataset[index]) -> data_sample
+     # we use batch_sampler to get correct data idx
+     loader_indices = data_loader.batch_sampler
+
+     for batch_indices, data in zip(loader_indices, data_loader):
+         with torch.no_grad():
+             result = model(return_loss=False, **data)
+
+         if show or out_dir:
+             img_tensor = data['img'][0]
+             img_metas = data['img_metas'][0].data[0]
+             imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg'])
+             assert len(imgs) == len(img_metas)
+
+             for img, img_meta in zip(imgs, img_metas):
+                 h, w, _ = img_meta['img_shape']
+                 img_show = img[:h, :w, :]
+
+                 ori_h, ori_w = img_meta['ori_shape'][:-1]
+                 img_show = mmcv.imresize(img_show, (ori_w, ori_h))
+
+                 if out_dir:
+                     out_file = osp.join(out_dir, img_meta['ori_filename'])
+                 else:
+                     out_file = None
+
+                 model.module.show_result(
+                     img_show,
+                     result,
+                     palette=dataset.PALETTE,
+                     show=show,
+                     out_file=out_file,
+                     opacity=opacity)
+
+         if efficient_test:
+             result = [np2tmp(_, tmpdir='.efficient_test') for _ in result]
+
+         if format_only:
+             result = dataset.format_results(
+                 result, indices=batch_indices, **format_args)
+         if pre_eval:
+             # TODO: adapt samples_per_gpu > 1.
+             # only samples_per_gpu=1 valid now
+             result = dataset.pre_eval(result, indices=batch_indices)
+             results.extend(result)
+         else:
+             results.extend(result)
+
+         batch_size = len(result)
+         for _ in range(batch_size):
+             prog_bar.update()
+
+     return results
+
+
+ def multi_gpu_test(model,
+                    data_loader,
+                    tmpdir=None,
+                    gpu_collect=False,
+                    efficient_test=False,
+                    pre_eval=False,
+                    format_only=False,
+                    format_args={}):
+     """Test model with multiple gpus by progressive mode.
+
+     This method tests a model with multiple gpus and collects the results
+     under two different modes: gpu and cpu. By setting 'gpu_collect=True'
+     it encodes results to gpu tensors and uses gpu communication for results
+     collection. In cpu mode it saves the results on different gpus to 'tmpdir'
+     and collects them by the rank 0 worker.
+
+     Args:
+         model (nn.Module): Model to be tested.
+         data_loader (utils.data.Dataloader): Pytorch data loader.
+         tmpdir (str): Path of directory to save the temporary results from
+             different gpus under cpu mode. The same path is used for efficient
+             test. Default: None.
+         gpu_collect (bool): Option to use either gpu or cpu to collect results.
+             Default: False.
+         efficient_test (bool): Whether to save the results as local numpy
+             files to save CPU memory during evaluation. Mutually exclusive
+             with pre_eval and format_results. Default: False.
+         pre_eval (bool): Use dataset.pre_eval() function to generate
+             pre_results for metric evaluation. Mutually exclusive with
+             efficient_test and format_results. Default: False.
+         format_only (bool): Only format the result for a results commit.
+             Mutually exclusive with pre_eval and efficient_test.
+             Default: False.
+         format_args (dict): The args for format_results. Default: {}.
+
+     Returns:
+         list: list of evaluation pre-results or list of saved file names.
+     """
+     if efficient_test:
+         warnings.warn(
+             'DeprecationWarning: ``efficient_test`` will be deprecated, the '
+             'evaluation is CPU memory friendly with pre_eval=True')
+         mmcv.mkdir_or_exist('.efficient_test')
+     # when none of them is set true, return segmentation results as
+     # a list of np.array.
+     assert [efficient_test, pre_eval, format_only].count(True) <= 1, \
+         '``efficient_test``, ``pre_eval`` and ``format_only`` are mutually ' \
+         'exclusive, only one of them can be true.'
+
+     model.eval()
+     results = []
+     dataset = data_loader.dataset
+     # The pipeline by which the data_loader retrieves samples from dataset:
+     # sampler -> batch_sampler -> indices
+     # The indices are passed to dataset_fetcher to get data from dataset.
+     # data_fetcher -> collate_fn(dataset[index]) -> data_sample
+     # we use batch_sampler to get correct data idx
+
+     # batch_sampler based on DistributedSampler, the indices only point to data
+     # samples of related machine.
+     loader_indices = data_loader.batch_sampler
+
+     rank, world_size = get_dist_info()
+     if rank == 0:
+         prog_bar = mmcv.ProgressBar(len(dataset))
+
+     for batch_indices, data in zip(loader_indices, data_loader):
+         with torch.no_grad():
+             result = model(return_loss=False, rescale=True, **data)
+
+         if efficient_test:
+             result = [np2tmp(_, tmpdir='.efficient_test') for _ in result]
+
+         if format_only:
+             result = dataset.format_results(
+                 result, indices=batch_indices, **format_args)
+         if pre_eval:
+             # TODO: adapt samples_per_gpu > 1.
+             # only samples_per_gpu=1 valid now
+             result = dataset.pre_eval(result, indices=batch_indices)
+
+         results.extend(result)
+
+         if rank == 0:
+             batch_size = len(result) * world_size
+             for _ in range(batch_size):
+                 prog_bar.update()
+
+     # collect results from all ranks
+     if gpu_collect:
+         results = collect_results_gpu(results, len(dataset))
+     else:
+         results = collect_results_cpu(results, len(dataset), tmpdir)
+     return results
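A sketch of driving `single_gpu_test` with the memory-friendly `pre_eval=True` path recommended by the deprecation warning above; `cfg` and `model` are assumed to exist, and the dataloader settings are illustrative:

from mmcv.parallel import MMDataParallel
from mmseg.apis import single_gpu_test
from mmseg.datasets import build_dataloader, build_dataset

dataset = build_dataset(cfg.data.test)
loader = build_dataloader(dataset, samples_per_gpu=1, workers_per_gpu=2,
                          dist=False, shuffle=False)
results = single_gpu_test(MMDataParallel(model, device_ids=[0]), loader,
                          pre_eval=True)
print(dataset.evaluate(results, metric='mIoU'))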
data_utils/easyportrait/mmseg/apis/train.py ADDED
@@ -0,0 +1,194 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ import os
+ import random
+ import warnings
+
+ import mmcv
+ import numpy as np
+ import torch
+ import torch.distributed as dist
+ from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
+                          build_runner, get_dist_info)
+ from mmcv.utils import build_from_cfg
+
+ from mmseg import digit_version
+ from mmseg.core import DistEvalHook, EvalHook, build_optimizer
+ from mmseg.datasets import build_dataloader, build_dataset
+ from mmseg.utils import (build_ddp, build_dp, find_latest_checkpoint,
+                          get_root_logger)
+
+
+ def init_random_seed(seed=None, device='cuda'):
+     """Initialize random seed.
+
+     If the seed is not set, the seed will be automatically randomized,
+     and then broadcast to all processes to prevent some potential bugs.
+     Args:
+         seed (int, optional): The seed. Defaults to None.
+         device (str): The device where the seed will be put on.
+             Defaults to 'cuda'.
+     Returns:
+         int: Seed to be used.
+     """
+     if seed is not None:
+         return seed
+
+     # Make sure all ranks share the same random seed to prevent
+     # some potential bugs. Please refer to
+     # https://github.com/open-mmlab/mmdetection/issues/6339
+     rank, world_size = get_dist_info()
+     seed = np.random.randint(2**31)
+     if world_size == 1:
+         return seed
+
+     if rank == 0:
+         random_num = torch.tensor(seed, dtype=torch.int32, device=device)
+     else:
+         random_num = torch.tensor(0, dtype=torch.int32, device=device)
+     dist.broadcast(random_num, src=0)
+     return random_num.item()
+
+
+ def set_random_seed(seed, deterministic=False):
+     """Set random seed.
+
+     Args:
+         seed (int): Seed to be used.
+         deterministic (bool): Whether to set the deterministic option for
+             the CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
+             to True and `torch.backends.cudnn.benchmark` to False.
+             Default: False.
+     """
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     if deterministic:
+         torch.backends.cudnn.deterministic = True
+         torch.backends.cudnn.benchmark = False
+
+
+ def train_segmentor(model,
+                     dataset,
+                     cfg,
+                     distributed=False,
+                     validate=False,
+                     timestamp=None,
+                     meta=None):
+     """Launch segmentor training."""
+     logger = get_root_logger(cfg.log_level)
+
+     # prepare data loaders
+     dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
+     # The default loader config
+     loader_cfg = dict(
+         # cfg.gpus will be ignored if distributed
+         num_gpus=len(cfg.gpu_ids),
+         dist=distributed,
+         seed=cfg.seed,
+         drop_last=True)
+     # The overall dataloader settings
+     loader_cfg.update({
+         k: v
+         for k, v in cfg.data.items() if k not in [
+             'train', 'val', 'test', 'train_dataloader', 'val_dataloader',
+             'test_dataloader'
+         ]
+     })
+
+     # The specific dataloader settings
+     train_loader_cfg = {**loader_cfg, **cfg.data.get('train_dataloader', {})}
+     data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset]
+
+     # put model on devices
+     if distributed:
+         find_unused_parameters = cfg.get('find_unused_parameters', False)
+         # Sets the `find_unused_parameters` parameter in
+         # the DDP wrapper
+         model = build_ddp(
+             model,
+             cfg.device,
+             device_ids=[int(os.environ['LOCAL_RANK'])],
+             broadcast_buffers=False,
+             find_unused_parameters=find_unused_parameters)
+     else:
+         if not torch.cuda.is_available():
+             assert digit_version(mmcv.__version__) >= digit_version('1.4.4'), \
+                 'Please use MMCV >= 1.4.4 for CPU training!'
+         model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids)
+
+     # build runner
+     optimizer = build_optimizer(model, cfg.optimizer)
+
+     if cfg.get('runner') is None:
+         cfg.runner = {'type': 'IterBasedRunner', 'max_iters': cfg.total_iters}
+         warnings.warn(
+             'config is now expected to have a `runner` section, '
+             'please set `runner` in your config.', UserWarning)
+
+     runner = build_runner(
+         cfg.runner,
+         default_args=dict(
+             model=model,
+             batch_processor=None,
+             optimizer=optimizer,
+             work_dir=cfg.work_dir,
+             logger=logger,
+             meta=meta))
+
+     # register hooks
+     runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
+                                    cfg.checkpoint_config, cfg.log_config,
+                                    cfg.get('momentum_config', None))
+     if distributed:
+         # when training by epoch in distributed mode, use
+         # `DistSamplerSeedHook` to set a different seed for the distributed
+         # sampler each epoch; this reshuffles the dataset every epoch and
+         # helps avoid overfitting.
+         if isinstance(runner, EpochBasedRunner):
+             runner.register_hook(DistSamplerSeedHook())
+
+     # an ugly workaround to make the .log and .log.json filenames the same
+     runner.timestamp = timestamp
+
+     # register eval hooks
+     if validate:
+         val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
+         # The specific dataloader settings
+         val_loader_cfg = {
+             **loader_cfg,
+             'samples_per_gpu': 1,
+             'shuffle': False,  # Not shuffle by default
+             **cfg.data.get('val_dataloader', {}),
+         }
+         val_dataloader = build_dataloader(val_dataset, **val_loader_cfg)
+         eval_cfg = cfg.get('evaluation', {})
+         eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
+         eval_hook = DistEvalHook if distributed else EvalHook
+         # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the
+         # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'.
+         runner.register_hook(
+             eval_hook(val_dataloader, **eval_cfg), priority='LOW')
+
+     # user-defined hooks
+     if cfg.get('custom_hooks', None):
+         custom_hooks = cfg.custom_hooks
+         assert isinstance(custom_hooks, list), \
+             f'custom_hooks expect list type, but got {type(custom_hooks)}'
+         for hook_cfg in cfg.custom_hooks:
+             assert isinstance(hook_cfg, dict), \
+                 'Each item in custom_hooks expects dict type, but got ' \
+                 f'{type(hook_cfg)}'
+             hook_cfg = hook_cfg.copy()
+             priority = hook_cfg.pop('priority', 'NORMAL')
+             hook = build_from_cfg(hook_cfg, HOOKS)
+             runner.register_hook(hook, priority=priority)
+
+     if cfg.resume_from is None and cfg.get('auto_resume'):
+         resume_from = find_latest_checkpoint(cfg.work_dir)
+         if resume_from is not None:
+             cfg.resume_from = resume_from
+     if cfg.resume_from:
+         runner.resume(cfg.resume_from)
+     elif cfg.load_from:
+         runner.load_checkpoint(cfg.load_from)
+     runner.run(data_loaders, cfg.workflow)
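The typical call sequence around `train_segmentor` mirrors mmseg's `tools/train.py`. A condensed sketch, assuming `cfg` is an `mmcv.Config` that already carries `work_dir`, `gpu_ids`, `seed` and `device`:

from mmseg.apis import train_segmentor
from mmseg.datasets import build_dataset
from mmseg.models import build_segmentor

model = build_segmentor(cfg.model, train_cfg=cfg.get('train_cfg'),
                        test_cfg=cfg.get('test_cfg'))
datasets = [build_dataset(cfg.data.train)]
train_segmentor(model, datasets, cfg, distributed=False, validate=True,
                meta=dict())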
data_utils/easyportrait/mmseg/core/__init__.py ADDED
@@ -0,0 +1,12 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ from .builder import (OPTIMIZER_BUILDERS, build_optimizer,
+                       build_optimizer_constructor)
+ from .evaluation import *  # noqa: F401, F403
+ from .hook import *  # noqa: F401, F403
+ from .optimizers import *  # noqa: F401, F403
+ from .seg import *  # noqa: F401, F403
+ from .utils import *  # noqa: F401, F403
+
+ __all__ = [
+     'OPTIMIZER_BUILDERS', 'build_optimizer', 'build_optimizer_constructor'
+ ]
data_utils/easyportrait/mmseg/core/builder.py ADDED
@@ -0,0 +1,33 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ import copy
+
+ from mmcv.runner.optimizer import OPTIMIZER_BUILDERS as MMCV_OPTIMIZER_BUILDERS
+ from mmcv.utils import Registry, build_from_cfg
+
+ OPTIMIZER_BUILDERS = Registry(
+     'optimizer builder', parent=MMCV_OPTIMIZER_BUILDERS)
+
+
+ def build_optimizer_constructor(cfg):
+     constructor_type = cfg.get('type')
+     if constructor_type in OPTIMIZER_BUILDERS:
+         return build_from_cfg(cfg, OPTIMIZER_BUILDERS)
+     elif constructor_type in MMCV_OPTIMIZER_BUILDERS:
+         return build_from_cfg(cfg, MMCV_OPTIMIZER_BUILDERS)
+     else:
+         raise KeyError(f'{constructor_type} is not registered '
+                        'in the optimizer builder registry.')
+
+
+ def build_optimizer(model, cfg):
+     optimizer_cfg = copy.deepcopy(cfg)
+     constructor_type = optimizer_cfg.pop('constructor',
+                                          'DefaultOptimizerConstructor')
+     paramwise_cfg = optimizer_cfg.pop('paramwise_cfg', None)
+     optim_constructor = build_optimizer_constructor(
+         dict(
+             type=constructor_type,
+             optimizer_cfg=optimizer_cfg,
+             paramwise_cfg=paramwise_cfg))
+     optimizer = optim_constructor(model)
+     return optimizer
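As a concrete example, the AdamW settings used by the configs in this commit pass through `build_optimizer` unchanged; mmcv's `DefaultOptimizerConstructor` then expands `paramwise_cfg.custom_keys` into per-group `lr` and `weight_decay` overrides (`model` here is any `nn.Module`):

optimizer = build_optimizer(
    model,
    dict(
        type='AdamW',
        lr=6e-05,
        betas=(0.9, 0.999),
        weight_decay=0.01,
        paramwise_cfg=dict(
            custom_keys=dict(
                pos_block=dict(decay_mult=0.0),
                norm=dict(decay_mult=0.0),
                head=dict(lr_mult=10.0)))))
for group in optimizer.param_groups:
    print(group.get('lr'), group.get('weight_decay'))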