kevin510 committed
Commit 2d1c98e · verified · parent 2561ade

Upload folder using huggingface_hub

__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .image_encoder import FastViTImageEncoder, FastViTImageConfig
+
+ __all__ = ["FastViTImageEncoder", "FastViTImageConfig"]
__pycache__/image_encoder.cpython-310.pyc ADDED
Binary file (2.08 kB)
__pycache__/mci.cpython-310.pyc ADDED
Binary file (35.3 kB)
config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "architectures": [
+     "FastViTImageEncoder"
+   ],
+   "embed_dim": 3072,
+   "image_size": 1024,
+   "model_type": "fastvit_image_encoder",
+   "torch_dtype": "float16",
+   "transformers_version": "4.48.3",
+
+   "auto_map": {
+     "AutoConfig": "image_encoder.FastViTImageConfig",
+     "AutoModel": "image_encoder.FastViTImageEncoder",
+     "AutoImageProcessor": "transformers.CLIPImageProcessor"
+   }
+ }
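
Note: the `auto_map` above routes the Auto classes to the custom code shipped in this repo (image_encoder.py), so loading requires trust_remote_code=True; the image processor resolves to the stock CLIPImageProcessor configured by preprocessor_config.json. A minimal loading sketch ("<user>/<repo>" is a placeholder, not taken from this commit):

# Load the encoder through the Auto classes declared in auto_map.
from transformers import AutoConfig, AutoModel, AutoImageProcessor

repo_id = "<user>/<repo>"   # placeholder for the actual Hub repository
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(repo_id)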
image_encoder.py ADDED
@@ -0,0 +1,90 @@
+ """
+ HF-compatible wrapper that turns the FastViT backbone into a pure *image encoder*.
+ The default `forward` returns per-patch embeddings of shape (B, N, embed_dim) taken
+ from the penultimate (conv_exp) feature map; `forward_pooled` returns a single
+ (B, 768) vector via the GlobalPool2D head installed below.
+ """
+ import torch
+ from transformers import PreTrainedModel, PretrainedConfig
+
+ from .mci import fastvithd, GlobalPool2D  # backbone factory and pooling head
+
+
+ # ----------------------- Config -----------------------
+ class FastViTImageConfig(PretrainedConfig):
+     """Minimal config so HF knows the image size & embed dim."""
+     model_type = "fastvit_image_encoder"
+
+     def __init__(
+         self,
+         image_size: int = 1024,
+         embed_dim: int = 3072,  # channels after conv_exp
+         **kwargs,
+     ):
+         self.image_size = image_size
+         self.embed_dim = embed_dim
+         super().__init__(**kwargs)
+
+
+ # ----------------------- Model ------------------------
+ class FastViTImageEncoder(PreTrainedModel):
+     """
+     Wraps FastViT-HD as an image-only encoder:
+     no text tower, no CLIP logits, only image embeddings.
+     """
+     config_class = FastViTImageConfig
+     main_input_name = "pixel_values"
+
+     def __init__(self, config: FastViTImageConfig):
+         super().__init__(config)
+
+         # `num_classes=0` makes the FastViT classifier an Identity; we then
+         # replace it with a GlobalPool2D head for the pooled-embedding path.
+         self.backbone = fastvithd(num_classes=0)
+         self.backbone.head = GlobalPool2D(
+             in_dim=3072,
+             out_dim=768,
+         )
+
+         # Standard HF post-init (weight init hooks, etc.).
+         self.post_init()
+
+     # ------------------------------------------
+     def forward(self, images):
+         """Return per-patch image embeddings; see `forward_images`."""
+         return self.forward_images(images)
+
+     def forward_pooled(self, pixel_values, return_dict=True, **unused):
+         """
+         Args:
+             pixel_values: (B, 3, H, W) tensor (already resized/normalized).
+         Returns:
+             Dict with a single key `"embeddings"` of shape (B, 768), i.e. the
+             output of the GlobalPool2D head installed in __init__.
+         """
+         embeddings = self.backbone(pixel_values)  # (B, 768)
+
+         if not return_dict:
+             return (embeddings,)
+
+         return {"embeddings": embeddings}
+
+     def feature_select(self, image_forward_outs):
+         # Features from the penultimate layer (conv_exp output)
+         image_features = image_forward_outs["image_embeddings"]
+
+         # Reshape 4D tensor (B, C, H, W) to 3D tokens (B, H*W, C)
+         B, C, H, W = image_features.shape
+         image_features = image_features.reshape(B, C, H * W)
+         image_features = image_features.transpose(1, 2)
+         return image_features
+
+     def forward_images(self, images):
+         if isinstance(images, list):
+             image_features = []
+             for image in images:
+                 image_forward_out = self.backbone(
+                     image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
+                     return_image_embeddings=True,
+                 )
+                 image_feature = self.feature_select(image_forward_out).to(image.dtype)
+                 image_features.append(image_feature)
+         else:
+             image_forward_outs = self.backbone(
+                 images.to(device=self.device, dtype=self.dtype),
+                 return_image_embeddings=True,
+             )
+             image_features = self.feature_select(image_forward_outs).to(images.dtype)
+
+         return image_features
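
Usage sketch for the wrapper above (assumes the repo's package layout, i.e. image_encoder.py next to mci.py with timm installed; adjust the import to your setup since image_encoder.py uses a relative import of mci). The expected shapes follow from FastViT-HD's total stride of 64 and the 3072-channel conv_exp output:

# Sketch only: randomly initialised weights, shape check for both output paths.
import torch
from image_encoder import FastViTImageEncoder, FastViTImageConfig  # adjust to your package path

config = FastViTImageConfig(image_size=1024, embed_dim=3072)
model = FastViTImageEncoder(config).eval()

pixel_values = torch.randn(2, 3, 1024, 1024)      # dummy batch; real inputs come from CLIPImageProcessor
with torch.no_grad():
    tokens = model(pixel_values)                  # forward() -> forward_images()
    pooled = model.forward_pooled(pixel_values)["embeddings"]

print(tokens.shape)   # expected (2, 256, 3072): 16x16 patch grid, conv_exp channels
print(pooled.shape)   # expected (2, 768): GlobalPool2D head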
mci.py ADDED
@@ -0,0 +1,1478 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ import copy
6
+ from functools import partial
7
+ from typing import List, Tuple, Optional, Union, Dict
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from torch import Tensor
12
+ import torch.nn.functional as F
13
+ from torch.nn.init import normal_
14
+
15
+ from timm.models import register_model
16
+ from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
17
+ from timm.layers import DropPath, SqueezeExcite
18
+
19
+
20
+ def _cfg(url="", **kwargs):
21
+ return {
22
+ "url": url,
23
+ "num_classes": 1000,
24
+ "input_size": (3, 256, 256),
25
+ "pool_size": None,
26
+ "crop_pct": 0.95,
27
+ "interpolation": "bicubic",
28
+ "mean": IMAGENET_DEFAULT_MEAN,
29
+ "std": IMAGENET_DEFAULT_STD,
30
+ "classifier": "head",
31
+ **kwargs,
32
+ }
33
+
34
+
35
+ default_cfgs = {
36
+ "fastvit_t": _cfg(crop_pct=0.9),
37
+ "fastvit_s": _cfg(crop_pct=0.9),
38
+ "fastvit_m": _cfg(crop_pct=0.95),
39
+ }
40
+
41
+
42
+ class SEBlock(nn.Module):
43
+ """Squeeze and Excite module.
44
+
45
+ Pytorch implementation of `Squeeze-and-Excitation Networks` -
46
+ https://arxiv.org/pdf/1709.01507.pdf
47
+ """
48
+
49
+ def __init__(self, in_channels: int, rd_ratio: float = 0.0625) -> None:
50
+ """Construct a Squeeze and Excite Module.
51
+
52
+ Args:
53
+ in_channels: Number of input channels.
54
+ rd_ratio: Input channel reduction ratio.
55
+ """
56
+ super(SEBlock, self).__init__()
57
+ self.reduce = nn.Conv2d(
58
+ in_channels=in_channels,
59
+ out_channels=int(in_channels * rd_ratio),
60
+ kernel_size=1,
61
+ stride=1,
62
+ bias=True,
63
+ )
64
+ self.expand = nn.Conv2d(
65
+ in_channels=int(in_channels * rd_ratio),
66
+ out_channels=in_channels,
67
+ kernel_size=1,
68
+ stride=1,
69
+ bias=True,
70
+ )
71
+
72
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
73
+ """Apply forward pass."""
74
+ b, c, h, w = inputs.size()
75
+ x = F.avg_pool2d(inputs, kernel_size=[h, w])
76
+ x = self.reduce(x)
77
+ x = F.relu(x)
78
+ x = self.expand(x)
79
+ x = torch.sigmoid(x)
80
+ x = x.view(-1, c, 1, 1)
81
+ return inputs * x
82
+
83
+
84
+ class MobileOneBlock(nn.Module):
85
+ """MobileOne building block.
86
+
87
+ This block has a multi-branched architecture at train-time
88
+ and plain-CNN style architecture at inference time
89
+ For more details, please refer to our paper:
90
+ `An Improved One millisecond Mobile Backbone` -
91
+ https://arxiv.org/pdf/2206.04040.pdf
92
+ """
93
+
94
+ def __init__(
95
+ self,
96
+ in_channels: int,
97
+ out_channels: int,
98
+ kernel_size: int,
99
+ stride: int = 1,
100
+ padding: int = 0,
101
+ dilation: int = 1,
102
+ groups: int = 1,
103
+ inference_mode: bool = False,
104
+ use_se: bool = False,
105
+ use_act: bool = True,
106
+ use_scale_branch: bool = True,
107
+ num_conv_branches: int = 1,
108
+ activation: nn.Module = nn.GELU(),
109
+ ) -> None:
110
+ """Construct a MobileOneBlock module.
111
+
112
+ Args:
113
+ in_channels: Number of channels in the input.
114
+ out_channels: Number of channels produced by the block.
115
+ kernel_size: Size of the convolution kernel.
116
+ stride: Stride size.
117
+ padding: Zero-padding size.
118
+ dilation: Kernel dilation factor.
119
+ groups: Group number.
120
+ inference_mode: If True, instantiates model in inference mode.
121
+ use_se: Whether to use SE-ReLU activations.
122
+ use_act: Whether to use activation. Default: ``True``
123
+ use_scale_branch: Whether to use scale branch. Default: ``True``
124
+ num_conv_branches: Number of linear conv branches.
125
+ """
126
+ super(MobileOneBlock, self).__init__()
127
+ self.inference_mode = inference_mode
128
+ self.groups = groups
129
+ self.stride = stride
130
+ self.padding = padding
131
+ self.dilation = dilation
132
+ self.kernel_size = kernel_size
133
+ self.in_channels = in_channels
134
+ self.out_channels = out_channels
135
+ self.num_conv_branches = num_conv_branches
136
+
137
+ # Check if SE-ReLU is requested
138
+ if use_se:
139
+ self.se = SEBlock(out_channels)
140
+ else:
141
+ self.se = nn.Identity()
142
+
143
+ if use_act:
144
+ self.activation = activation
145
+ else:
146
+ self.activation = nn.Identity()
147
+
148
+ if inference_mode:
149
+ self.reparam_conv = nn.Conv2d(
150
+ in_channels=in_channels,
151
+ out_channels=out_channels,
152
+ kernel_size=kernel_size,
153
+ stride=stride,
154
+ padding=padding,
155
+ dilation=dilation,
156
+ groups=groups,
157
+ bias=True,
158
+ )
159
+ else:
160
+ # Re-parameterizable skip connection
161
+ # Fallback, sometimes batchnorm tensors
162
+ # do not get instantiated correctly on some processes
163
+ # when using deepspeed + accelerate
164
+ norm_layer = nn.BatchNorm2d(num_features=in_channels)
165
+ if norm_layer.weight.shape[0] == 0:
166
+ norm_layer.weight = nn.Parameter(torch.zeros(in_channels))
167
+ if norm_layer.bias.shape[0] == 0:
168
+ norm_layer.bias = nn.Parameter(torch.zeros(in_channels))
169
+
170
+ self.rbr_skip = (
171
+ norm_layer
172
+ if out_channels == in_channels and stride == 1
173
+ else None
174
+ )
175
+
176
+ # Re-parameterizable conv branches
177
+ if num_conv_branches > 0:
178
+ rbr_conv = list()
179
+ for _ in range(self.num_conv_branches):
180
+ rbr_conv.append(
181
+ self._conv_bn(kernel_size=kernel_size, padding=padding)
182
+ )
183
+ self.rbr_conv = nn.ModuleList(rbr_conv)
184
+ else:
185
+ self.rbr_conv = None
186
+
187
+ # Re-parameterizable scale branch
188
+ self.rbr_scale = None
189
+ if not isinstance(kernel_size, int):
190
+ kernel_size = kernel_size[0]
191
+ if (kernel_size > 1) and use_scale_branch:
192
+ self.rbr_scale = self._conv_bn(kernel_size=1, padding=0)
193
+
194
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
195
+ """Apply forward pass."""
196
+ # Inference mode forward pass.
197
+ if self.inference_mode:
198
+ return self.activation(self.se(self.reparam_conv(x)))
199
+
200
+ # Multi-branched train-time forward pass.
201
+ # Skip branch output
202
+ identity_out = 0
203
+ if self.rbr_skip is not None:
204
+ identity_out = self.rbr_skip(x)
205
+
206
+ # Scale branch output
207
+ scale_out = 0
208
+ if self.rbr_scale is not None:
209
+ scale_out = self.rbr_scale(x)
210
+
211
+ # Other branches
212
+ out = scale_out + identity_out
213
+ if self.rbr_conv is not None:
214
+ for ix in range(self.num_conv_branches):
215
+ out += self.rbr_conv[ix](x)
216
+
217
+ return self.activation(self.se(out))
218
+
219
+ def reparameterize(self):
220
+ """Following works like `RepVGG: Making VGG-style ConvNets Great Again` -
221
+ https://arxiv.org/pdf/2101.03697.pdf. We re-parameterize multi-branched
222
+ architecture used at training time to obtain a plain CNN-like structure
223
+ for inference.
224
+ """
225
+ if self.inference_mode:
226
+ return
227
+ kernel, bias = self._get_kernel_bias()
228
+ self.reparam_conv = nn.Conv2d(
229
+ in_channels=self.in_channels,
230
+ out_channels=self.out_channels,
231
+ kernel_size=self.kernel_size,
232
+ stride=self.stride,
233
+ padding=self.padding,
234
+ dilation=self.dilation,
235
+ groups=self.groups,
236
+ bias=True,
237
+ )
238
+ self.reparam_conv.weight.data = kernel
239
+ self.reparam_conv.bias.data = bias
240
+
241
+ # Delete un-used branches
242
+ self.__delattr__("rbr_conv")
243
+ self.__delattr__("rbr_scale")
244
+ if hasattr(self, "rbr_skip"):
245
+ self.__delattr__("rbr_skip")
246
+
247
+ self.inference_mode = True
248
+
249
+ def _get_kernel_bias(self) -> Tuple[torch.Tensor, torch.Tensor]:
250
+ """Method to obtain re-parameterized kernel and bias.
251
+ Reference: https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py#L83
252
+
253
+ Returns:
254
+ Tuple of (kernel, bias) after fusing branches.
255
+ """
256
+ # get weights and bias of scale branch
257
+ kernel_scale = 0
258
+ bias_scale = 0
259
+ if self.rbr_scale is not None:
260
+ kernel_scale, bias_scale = self._fuse_bn_tensor(self.rbr_scale)
261
+ # Pad scale branch kernel to match conv branch kernel size.
262
+ pad = self.kernel_size // 2
263
+ kernel_scale = torch.nn.functional.pad(kernel_scale, [pad, pad, pad, pad])
264
+
265
+ # get weights and bias of skip branch
266
+ kernel_identity = 0
267
+ bias_identity = 0
268
+ if self.rbr_skip is not None:
269
+ kernel_identity, bias_identity = self._fuse_bn_tensor(self.rbr_skip)
270
+
271
+ # get weights and bias of conv branches
272
+ kernel_conv = 0
273
+ bias_conv = 0
274
+ if self.rbr_conv is not None:
275
+ for ix in range(self.num_conv_branches):
276
+ _kernel, _bias = self._fuse_bn_tensor(self.rbr_conv[ix])
277
+ kernel_conv += _kernel
278
+ bias_conv += _bias
279
+
280
+ kernel_final = kernel_conv + kernel_scale + kernel_identity
281
+ bias_final = bias_conv + bias_scale + bias_identity
282
+ return kernel_final, bias_final
283
+
284
+ def _fuse_bn_tensor(
285
+ self, branch: Union[nn.Sequential, nn.BatchNorm2d]
286
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
287
+ """Method to fuse batchnorm layer with preceding conv layer.
288
+ Reference: https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py#L95
289
+
290
+ Args:
291
+ branch: Sequence of ops to be fused.
292
+
293
+ Returns:
294
+ Tuple of (kernel, bias) after fusing batchnorm.
295
+ """
296
+ if isinstance(branch, nn.Sequential):
297
+ kernel = branch.conv.weight
298
+ running_mean = branch.bn.running_mean
299
+ running_var = branch.bn.running_var
300
+ gamma = branch.bn.weight
301
+ beta = branch.bn.bias
302
+ eps = branch.bn.eps
303
+ else:
304
+ assert isinstance(branch, nn.BatchNorm2d)
305
+ if not hasattr(self, "id_tensor"):
306
+ input_dim = self.in_channels // self.groups
307
+
308
+ kernel_size = self.kernel_size
309
+ if isinstance(self.kernel_size, int):
310
+ kernel_size = (self.kernel_size, self.kernel_size)
311
+
312
+ kernel_value = torch.zeros(
313
+ (self.in_channels, input_dim, kernel_size[0], kernel_size[1]),
314
+ dtype=branch.weight.dtype,
315
+ device=branch.weight.device,
316
+ )
317
+ for i in range(self.in_channels):
318
+ kernel_value[
319
+ i, i % input_dim, kernel_size[0] // 2, kernel_size[1] // 2
320
+ ] = 1
321
+ self.id_tensor = kernel_value
322
+ kernel = self.id_tensor
323
+ running_mean = branch.running_mean
324
+ running_var = branch.running_var
325
+ gamma = branch.weight
326
+ beta = branch.bias
327
+ eps = branch.eps
328
+ std = (running_var + eps).sqrt()
329
+ t = (gamma / std).reshape(-1, 1, 1, 1)
330
+ return kernel * t, beta - running_mean * gamma / std
331
+
332
+ def _conv_bn(self, kernel_size: int, padding: int) -> nn.Sequential:
333
+ """Helper method to construct conv-batchnorm layers.
334
+
335
+ Args:
336
+ kernel_size: Size of the convolution kernel.
337
+ padding: Zero-padding size.
338
+
339
+ Returns:
340
+ Conv-BN module.
341
+ """
342
+ # Fallback, sometimes batchnorm tensors
343
+ # do not get instantiated correctly on some processes
344
+ # when using deepspeed + accelerate
345
+ norm_layer = nn.BatchNorm2d(num_features=self.out_channels)
346
+ if norm_layer.weight.shape[0] == 0:
347
+ norm_layer.weight = nn.Parameter(torch.zeros(self.out_channels))
348
+ if norm_layer.bias.shape[0] == 0:
349
+ norm_layer.bias = nn.Parameter(torch.zeros(self.out_channels))
350
+
351
+ mod_list = nn.Sequential()
352
+ mod_list.add_module(
353
+ "conv",
354
+ nn.Conv2d(
355
+ in_channels=self.in_channels,
356
+ out_channels=self.out_channels,
357
+ kernel_size=kernel_size,
358
+ stride=self.stride,
359
+ padding=padding,
360
+ groups=self.groups,
361
+ bias=False,
362
+ ),
363
+ )
364
+ mod_list.add_module("bn", norm_layer)
365
+ return mod_list
366
+
367
+
368
+ class ReparamLargeKernelConv(nn.Module):
369
+ """Building Block of RepLKNet
370
+
371
+ This class defines overparameterized large kernel conv block
372
+ introduced in `RepLKNet <https://arxiv.org/abs/2203.06717>`_
373
+
374
+ Reference: https://github.com/DingXiaoH/RepLKNet-pytorch
375
+ """
376
+
377
+ def __init__(
378
+ self,
379
+ in_channels: int,
380
+ out_channels: int,
381
+ kernel_size: int,
382
+ stride: int,
383
+ groups: int,
384
+ small_kernel: int,
385
+ inference_mode: bool = False,
386
+ use_se: bool = False,
387
+ activation: nn.Module = nn.GELU(),
388
+ ) -> None:
389
+ """Construct a ReparamLargeKernelConv module.
390
+
391
+ Args:
392
+ in_channels: Number of input channels.
393
+ out_channels: Number of output channels.
394
+ kernel_size: Kernel size of the large kernel conv branch.
395
+ stride: Stride size. Default: 1
396
+ groups: Group number. Default: 1
397
+ small_kernel: Kernel size of small kernel conv branch.
398
+ inference_mode: If True, instantiates model in inference mode. Default: ``False``
399
+ activation: Activation module. Default: ``nn.GELU``
400
+ """
401
+ super(ReparamLargeKernelConv, self).__init__()
402
+
403
+ self.stride = stride
404
+ self.groups = groups
405
+ self.in_channels = in_channels
406
+ self.out_channels = out_channels
407
+ self.activation = activation
408
+
409
+ self.kernel_size = kernel_size
410
+ self.small_kernel = small_kernel
411
+ self.padding = kernel_size // 2
412
+
413
+ # Check if SE is requested
414
+ if use_se:
415
+ self.se = SqueezeExcite(out_channels, rd_ratio=0.25)
416
+ else:
417
+ self.se = nn.Identity()
418
+
419
+ if inference_mode:
420
+ self.lkb_reparam = nn.Conv2d(
421
+ in_channels=in_channels,
422
+ out_channels=out_channels,
423
+ kernel_size=kernel_size,
424
+ stride=stride,
425
+ padding=self.padding,
426
+ dilation=1,
427
+ groups=groups,
428
+ bias=True,
429
+ )
430
+ else:
431
+ self.lkb_origin = self._conv_bn(
432
+ kernel_size=kernel_size, padding=self.padding
433
+ )
434
+ if small_kernel is not None:
435
+ assert (
436
+ small_kernel <= kernel_size
437
+ ), "The kernel size for re-param cannot be larger than the large kernel!"
438
+ self.small_conv = self._conv_bn(
439
+ kernel_size=small_kernel, padding=small_kernel // 2
440
+ )
441
+
442
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
443
+ """Apply forward pass."""
444
+ if hasattr(self, "lkb_reparam"):
445
+ out = self.lkb_reparam(x)
446
+ else:
447
+ out = self.lkb_origin(x)
448
+ if hasattr(self, "small_conv"):
449
+ out += self.small_conv(x)
450
+
451
+ return self.activation(self.se(out))
452
+
453
+ def get_kernel_bias(self) -> Tuple[torch.Tensor, torch.Tensor]:
454
+ """Method to obtain re-parameterized kernel and bias.
455
+ Reference: https://github.com/DingXiaoH/RepLKNet-pytorch
456
+
457
+ Returns:
458
+ Tuple of (kernel, bias) after fusing branches.
459
+ """
460
+ eq_k, eq_b = self._fuse_bn(self.lkb_origin.conv, self.lkb_origin.bn)
461
+ if hasattr(self, "small_conv"):
462
+ small_k, small_b = self._fuse_bn(self.small_conv.conv, self.small_conv.bn)
463
+ eq_b += small_b
464
+ eq_k += nn.functional.pad(
465
+ small_k, [(self.kernel_size - self.small_kernel) // 2] * 4
466
+ )
467
+ return eq_k, eq_b
468
+
469
+ def reparameterize(self) -> None:
470
+ """
471
+ Following works like `RepVGG: Making VGG-style ConvNets Great Again` -
472
+ https://arxiv.org/pdf/2101.03697.pdf. We re-parameterize multi-branched
473
+ architecture used at training time to obtain a plain CNN-like structure
474
+ for inference.
475
+ """
476
+ eq_k, eq_b = self.get_kernel_bias()
477
+ self.lkb_reparam = nn.Conv2d(
478
+ in_channels=self.in_channels,
479
+ out_channels=self.out_channels,
480
+ kernel_size=self.kernel_size,
481
+ stride=self.stride,
482
+ padding=self.padding,
483
+ dilation=self.lkb_origin.conv.dilation,
484
+ groups=self.groups,
485
+ bias=True,
486
+ )
487
+
488
+ self.lkb_reparam.weight.data = eq_k
489
+ self.lkb_reparam.bias.data = eq_b
490
+ self.__delattr__("lkb_origin")
491
+ if hasattr(self, "small_conv"):
492
+ self.__delattr__("small_conv")
493
+
494
+ @staticmethod
495
+ def _fuse_bn(
496
+ conv: torch.Tensor, bn: nn.BatchNorm2d
497
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
498
+ """Method to fuse batchnorm layer with conv layer.
499
+
500
+ Args:
501
+ conv: Convolutional kernel weights.
502
+ bn: Batchnorm 2d layer.
503
+
504
+ Returns:
505
+ Tuple of (kernel, bias) after fusing batchnorm.
506
+ """
507
+ kernel = conv.weight
508
+ running_mean = bn.running_mean
509
+ running_var = bn.running_var
510
+ gamma = bn.weight
511
+ beta = bn.bias
512
+ eps = bn.eps
513
+ std = (running_var + eps).sqrt()
514
+ t = (gamma / std).reshape(-1, 1, 1, 1)
515
+ return kernel * t, beta - running_mean * gamma / std
516
+
517
+ def _conv_bn(self, kernel_size: int, padding: int = 0) -> nn.Sequential:
518
+ """Helper method to construct conv-batchnorm layers.
519
+
520
+ Args:
521
+ kernel_size: Size of the convolution kernel.
522
+ padding: Zero-padding size.
523
+
524
+ Returns:
525
+ A nn.Sequential Conv-BN module.
526
+ """
527
+ # Fallback, sometimes batchnorm tensors
528
+ # do not get instantiated correctly on some processes
529
+ # when using deepspeed + accelerate
530
+ norm_layer = nn.BatchNorm2d(num_features=self.out_channels)
531
+ if norm_layer.weight.shape[0] == 0:
532
+ norm_layer.weight = nn.Parameter(torch.zeros(self.out_channels))
533
+ if norm_layer.bias.shape[0] == 0:
534
+ norm_layer.bias = nn.Parameter(torch.zeros(self.out_channels))
535
+
536
+ mod_list = nn.Sequential()
537
+ mod_list.add_module(
538
+ "conv",
539
+ nn.Conv2d(
540
+ in_channels=self.in_channels,
541
+ out_channels=self.out_channels,
542
+ kernel_size=kernel_size,
543
+ stride=self.stride,
544
+ padding=padding,
545
+ groups=self.groups,
546
+ bias=False,
547
+ ),
548
+ )
549
+ mod_list.add_module("bn", norm_layer)
550
+ return mod_list
551
+
552
+
553
+ def convolutional_stem(
554
+ in_channels: int, out_channels: int, inference_mode: bool = False, use_scale_branch: bool = True,
555
+ ) -> nn.Sequential:
556
+ """Build convolutional stem with MobileOne blocks.
557
+
558
+ Args:
559
+ in_channels: Number of input channels.
560
+ out_channels: Number of output channels.
561
+ inference_mode: Flag to instantiate model in inference mode. Default: ``False``
562
+
563
+ Returns:
564
+ nn.Sequential object with stem elements.
565
+ """
566
+ return nn.Sequential(
567
+ MobileOneBlock(
568
+ in_channels=in_channels,
569
+ out_channels=out_channels,
570
+ kernel_size=3,
571
+ stride=2,
572
+ padding=1,
573
+ groups=1,
574
+ inference_mode=inference_mode,
575
+ use_se=False,
576
+ num_conv_branches=1,
577
+ use_scale_branch=use_scale_branch
578
+ ),
579
+ MobileOneBlock(
580
+ in_channels=out_channels,
581
+ out_channels=out_channels,
582
+ kernel_size=3,
583
+ stride=2,
584
+ padding=1,
585
+ groups=out_channels,
586
+ inference_mode=inference_mode,
587
+ use_se=False,
588
+ num_conv_branches=1,
589
+ use_scale_branch=use_scale_branch
590
+ ),
591
+ MobileOneBlock(
592
+ in_channels=out_channels,
593
+ out_channels=out_channels,
594
+ kernel_size=1,
595
+ stride=1,
596
+ padding=0,
597
+ groups=1,
598
+ inference_mode=inference_mode,
599
+ use_se=False,
600
+ num_conv_branches=1,
601
+ use_scale_branch=use_scale_branch
602
+ ),
603
+ )
604
+
605
+
606
+ class LayerNormChannel(nn.Module):
607
+ """
608
+ LayerNorm only for Channel Dimension.
609
+ Input: tensor in shape [B, C, H, W]
610
+ """
611
+ def __init__(self, num_features, eps=1e-05) -> None:
612
+ super().__init__()
613
+ self.weight = nn.Parameter(torch.ones(num_features))
614
+ self.bias = nn.Parameter(torch.zeros(num_features))
615
+ self.eps = eps
616
+
617
+ def forward(self, x) -> torch.Tensor:
618
+ u = x.mean(1, keepdim=True)
619
+ s = (x - u).pow(2).mean(1, keepdim=True)
620
+ x = (x - u) / torch.sqrt(s + self.eps)
621
+ x = self.weight.unsqueeze(-1).unsqueeze(-1) * x \
622
+ + self.bias.unsqueeze(-1).unsqueeze(-1)
623
+ return x
624
+
625
+
626
+ class MHSA(nn.Module):
627
+ """Multi-headed Self Attention module.
628
+
629
+ Source modified from:
630
+ https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
631
+ """
632
+
633
+ def __init__(
634
+ self,
635
+ dim: int,
636
+ head_dim: int = 32,
637
+ qkv_bias: bool = False,
638
+ attn_drop: float = 0.0,
639
+ proj_drop: float = 0.0,
640
+ ) -> None:
641
+ """Build MHSA module that can handle 3D or 4D input tensors.
642
+
643
+ Args:
644
+ dim: Number of embedding dimensions.
645
+ head_dim: Number of hidden dimensions per head. Default: ``32``
646
+ qkv_bias: Use bias or not. Default: ``False``
647
+ attn_drop: Dropout rate for attention tensor.
648
+ proj_drop: Dropout rate for projection tensor.
649
+ """
650
+ super().__init__()
651
+ assert dim % head_dim == 0, "dim should be divisible by head_dim"
652
+ self.head_dim = head_dim
653
+ self.num_heads = dim // head_dim
654
+ self.scale = head_dim**-0.5
655
+
656
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
657
+ self.attn_drop = nn.Dropout(attn_drop)
658
+ self.proj = nn.Linear(dim, dim)
659
+ self.proj_drop = nn.Dropout(proj_drop)
660
+
661
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
662
+ shape = x.shape
663
+ B, C, H, W = shape
664
+ N = H * W
665
+ if len(shape) == 4:
666
+ x = torch.flatten(x, start_dim=2).transpose(-2, -1) # (B, N, C)
667
+ qkv = (
668
+ self.qkv(x)
669
+ .reshape(B, N, 3, self.num_heads, self.head_dim)
670
+ .permute(2, 0, 3, 1, 4)
671
+ )
672
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
673
+
674
+ # trick here to make [email protected] more stable
675
+ attn = (q * self.scale) @ k.transpose(-2, -1)
676
+ attn = attn.softmax(dim=-1)
677
+ attn = self.attn_drop(attn)
678
+
679
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
680
+ x = self.proj(x)
681
+ x = self.proj_drop(x)
682
+ if len(shape) == 4:
683
+ x = x.transpose(-2, -1).reshape(B, C, H, W)
684
+
685
+ return x
686
+
687
+
688
+ class PatchEmbed(nn.Module):
689
+ """Convolutional patch embedding layer."""
690
+
691
+ def __init__(
692
+ self,
693
+ patch_size: int,
694
+ stride: int,
695
+ in_channels: int,
696
+ embed_dim: int,
697
+ inference_mode: bool = False,
698
+ use_se: bool = False,
699
+ ) -> None:
700
+ """Build patch embedding layer.
701
+
702
+ Args:
703
+ patch_size: Patch size for embedding computation.
704
+ stride: Stride for convolutional embedding layer.
705
+ in_channels: Number of channels of input tensor.
706
+ embed_dim: Number of embedding dimensions.
707
+ inference_mode: Flag to instantiate model in inference mode. Default: ``False``
708
+ use_se: If ``True`` SE block will be used.
709
+ """
710
+ super().__init__()
711
+ block = list()
712
+ block.append(
713
+ ReparamLargeKernelConv(
714
+ in_channels=in_channels,
715
+ out_channels=embed_dim,
716
+ kernel_size=patch_size,
717
+ stride=stride,
718
+ groups=in_channels,
719
+ small_kernel=3,
720
+ inference_mode=inference_mode,
721
+ use_se=use_se,
722
+ )
723
+ )
724
+ block.append(
725
+ MobileOneBlock(
726
+ in_channels=embed_dim,
727
+ out_channels=embed_dim,
728
+ kernel_size=1,
729
+ stride=1,
730
+ padding=0,
731
+ groups=1,
732
+ inference_mode=inference_mode,
733
+ use_se=False,
734
+ num_conv_branches=1,
735
+ )
736
+ )
737
+ self.proj = nn.Sequential(*block)
738
+
739
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
740
+ x = self.proj(x)
741
+ return x
742
+
743
+
744
+ class RepMixer(nn.Module):
745
+ """Reparameterizable token mixer.
746
+
747
+ For more details, please refer to our paper:
748
+ `FastViT: A Fast Hybrid Vision Transformer using Structural Reparameterization <https://arxiv.org/pdf/2303.14189.pdf>`_
749
+ """
750
+
751
+ def __init__(
752
+ self,
753
+ dim,
754
+ kernel_size=3,
755
+ use_layer_scale=True,
756
+ layer_scale_init_value=1e-5,
757
+ inference_mode: bool = False,
758
+ ):
759
+ """Build RepMixer Module.
760
+
761
+ Args:
762
+ dim: Input feature map dimension. :math:`C_{in}` from an expected input of size :math:`(B, C_{in}, H, W)`.
763
+ kernel_size: Kernel size for spatial mixing. Default: 3
764
+ use_layer_scale: If True, learnable layer scale is used. Default: ``True``
765
+ layer_scale_init_value: Initial value for layer scale. Default: 1e-5
766
+ inference_mode: If True, instantiates model in inference mode. Default: ``False``
767
+ """
768
+ super().__init__()
769
+ self.dim = dim
770
+ self.kernel_size = kernel_size
771
+ self.inference_mode = inference_mode
772
+
773
+ if inference_mode:
774
+ self.reparam_conv = nn.Conv2d(
775
+ in_channels=self.dim,
776
+ out_channels=self.dim,
777
+ kernel_size=self.kernel_size,
778
+ stride=1,
779
+ padding=self.kernel_size // 2,
780
+ groups=self.dim,
781
+ bias=True,
782
+ )
783
+ else:
784
+ self.norm = MobileOneBlock(
785
+ dim,
786
+ dim,
787
+ kernel_size,
788
+ padding=kernel_size // 2,
789
+ groups=dim,
790
+ use_act=False,
791
+ use_scale_branch=False,
792
+ num_conv_branches=0,
793
+ )
794
+ self.mixer = MobileOneBlock(
795
+ dim,
796
+ dim,
797
+ kernel_size,
798
+ padding=kernel_size // 2,
799
+ groups=dim,
800
+ use_act=False,
801
+ )
802
+ self.use_layer_scale = use_layer_scale
803
+ if use_layer_scale:
804
+ self.layer_scale = nn.Parameter(
805
+ layer_scale_init_value * torch.ones((dim, 1, 1)), requires_grad=True
806
+ )
807
+
808
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
809
+ if hasattr(self, "reparam_conv"):
810
+ x = self.reparam_conv(x)
811
+ return x
812
+ else:
813
+ if self.use_layer_scale:
814
+ x = x + self.layer_scale * (self.mixer(x) - self.norm(x))
815
+ else:
816
+ x = x + self.mixer(x) - self.norm(x)
817
+ return x
818
+
819
+ def reparameterize(self) -> None:
820
+ """Reparameterize mixer and norm into a single
821
+ convolutional layer for efficient inference.
822
+ """
823
+ if self.inference_mode:
824
+ return
825
+
826
+ self.mixer.reparameterize()
827
+ self.norm.reparameterize()
828
+
829
+ if self.use_layer_scale:
830
+ w = self.mixer.id_tensor + self.layer_scale.unsqueeze(-1) * (
831
+ self.mixer.reparam_conv.weight - self.norm.reparam_conv.weight
832
+ )
833
+ b = torch.squeeze(self.layer_scale) * (
834
+ self.mixer.reparam_conv.bias - self.norm.reparam_conv.bias
835
+ )
836
+ else:
837
+ w = (
838
+ self.mixer.id_tensor
839
+ + self.mixer.reparam_conv.weight
840
+ - self.norm.reparam_conv.weight
841
+ )
842
+ b = self.mixer.reparam_conv.bias - self.norm.reparam_conv.bias
843
+
844
+ self.reparam_conv = nn.Conv2d(
845
+ in_channels=self.dim,
846
+ out_channels=self.dim,
847
+ kernel_size=self.kernel_size,
848
+ stride=1,
849
+ padding=self.kernel_size // 2,
850
+ groups=self.dim,
851
+ bias=True,
852
+ )
853
+ self.reparam_conv.weight.data = w
854
+ self.reparam_conv.bias.data = b
855
+
856
+ self.__delattr__("mixer")
857
+ self.__delattr__("norm")
858
+ if self.use_layer_scale:
859
+ self.__delattr__("layer_scale")
860
+
861
+
862
+ class ConvFFN(nn.Module):
863
+ """Convolutional FFN Module."""
864
+
865
+ def __init__(
866
+ self,
867
+ in_channels: int,
868
+ hidden_channels: Optional[int] = None,
869
+ out_channels: Optional[int] = None,
870
+ act_layer: nn.Module = nn.GELU,
871
+ drop: float = 0.0,
872
+ ) -> None:
873
+ """Build convolutional FFN module.
874
+
875
+ Args:
876
+ in_channels: Number of input channels.
877
+ hidden_channels: Number of channels after expansion. Default: None
878
+ out_channels: Number of output channels. Default: None
879
+ act_layer: Activation layer. Default: ``GELU``
880
+ drop: Dropout rate. Default: ``0.0``.
881
+ """
882
+ super().__init__()
883
+ out_channels = out_channels or in_channels
884
+ hidden_channels = hidden_channels or in_channels
885
+ self.conv = nn.Sequential()
886
+ self.conv.add_module(
887
+ "conv",
888
+ nn.Conv2d(
889
+ in_channels=in_channels,
890
+ out_channels=out_channels,
891
+ kernel_size=7,
892
+ padding=3,
893
+ groups=in_channels,
894
+ bias=False,
895
+ ),
896
+ )
897
+
898
+ # Fallback, sometimes batchnorm tensors
899
+ # do not get instantiated correctly on some processes
900
+ # when using deepspeed + accelerate
901
+ norm_layer = nn.BatchNorm2d(num_features=out_channels)
902
+ if norm_layer.weight.shape[0] == 0:
903
+ norm_layer.weight = nn.Parameter(torch.zeros(out_channels))
904
+ if norm_layer.bias.shape[0] == 0:
905
+ norm_layer.bias = nn.Parameter(torch.zeros(out_channels))
906
+
907
+ self.conv.add_module("bn", norm_layer)
908
+ self.fc1 = nn.Conv2d(in_channels, hidden_channels, kernel_size=1)
909
+ self.act = act_layer()
910
+ self.fc2 = nn.Conv2d(hidden_channels, out_channels, kernel_size=1)
911
+ self.drop = nn.Dropout(drop)
912
+ self.apply(self._init_weights)
913
+
914
+ def _init_weights(self, m: nn.Module) -> None:
915
+ if isinstance(m, nn.Conv2d):
916
+ normal_(m.weight, std=0.02)
917
+ if m.bias is not None:
918
+ nn.init.constant_(m.bias, 0)
919
+
920
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
921
+ x = self.conv(x)
922
+ x = self.fc1(x)
923
+ x = self.act(x)
924
+ x = self.drop(x)
925
+ x = self.fc2(x)
926
+ x = self.drop(x)
927
+ return x
928
+
929
+
930
+ class RepCPE(nn.Module):
931
+ """Implementation of conditional positional encoding.
932
+
933
+ For more details refer to paper:
934
+ `Conditional Positional Encodings for Vision Transformers <https://arxiv.org/pdf/2102.10882.pdf>`_
935
+
936
+ In our implementation, we can reparameterize this module to eliminate a skip connection.
937
+ """
938
+
939
+ def __init__(
940
+ self,
941
+ in_channels: int,
942
+ embed_dim: int = 768,
943
+ spatial_shape: Union[int, Tuple[int, int]] = (7, 7),
944
+ inference_mode=False,
945
+ ) -> None:
946
+ """Build reparameterizable conditional positional encoding
947
+
948
+ Args:
949
+ in_channels: Number of input channels.
950
+ embed_dim: Number of embedding dimensions. Default: 768
951
+ spatial_shape: Spatial shape of kernel for positional encoding. Default: (7, 7)
952
+ inference_mode: Flag to instantiate block in inference mode. Default: ``False``
953
+ """
954
+ super(RepCPE, self).__init__()
955
+ if isinstance(spatial_shape, int):
956
+ spatial_shape = tuple([spatial_shape] * 2)
957
+ assert isinstance(spatial_shape, Tuple), (
958
+ f'"spatial_shape" must by a sequence or int, '
959
+ f"get {type(spatial_shape)} instead."
960
+ )
961
+ assert len(spatial_shape) == 2, (
962
+ f'Length of "spatial_shape" should be 2, '
963
+ f"got {len(spatial_shape)} instead."
964
+ )
965
+
966
+ self.spatial_shape = spatial_shape
967
+ self.embed_dim = embed_dim
968
+ self.in_channels = in_channels
969
+ self.groups = embed_dim
970
+
971
+ if inference_mode:
972
+ self.reparam_conv = nn.Conv2d(
973
+ in_channels=self.in_channels,
974
+ out_channels=self.embed_dim,
975
+ kernel_size=self.spatial_shape,
976
+ stride=1,
977
+ padding=int(self.spatial_shape[0] // 2),
978
+ groups=self.embed_dim,
979
+ bias=True,
980
+ )
981
+ else:
982
+ self.pe = nn.Conv2d(
983
+ in_channels,
984
+ embed_dim,
985
+ spatial_shape,
986
+ 1,
987
+ int(spatial_shape[0] // 2),
988
+ bias=True,
989
+ groups=embed_dim,
990
+ )
991
+
992
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
993
+ if hasattr(self, "reparam_conv"):
994
+ x = self.reparam_conv(x)
995
+ return x
996
+ else:
997
+ x = self.pe(x) + x
998
+ return x
999
+
1000
+ def reparameterize(self) -> None:
1001
+ # Build equivalent Id tensor
1002
+ input_dim = self.in_channels // self.groups
1003
+ kernel_value = torch.zeros(
1004
+ (
1005
+ self.in_channels,
1006
+ input_dim,
1007
+ self.spatial_shape[0],
1008
+ self.spatial_shape[1],
1009
+ ),
1010
+ dtype=self.pe.weight.dtype,
1011
+ device=self.pe.weight.device,
1012
+ )
1013
+ for i in range(self.in_channels):
1014
+ kernel_value[
1015
+ i,
1016
+ i % input_dim,
1017
+ self.spatial_shape[0] // 2,
1018
+ self.spatial_shape[1] // 2,
1019
+ ] = 1
1020
+ id_tensor = kernel_value
1021
+
1022
+ # Reparameterize Id tensor and conv
1023
+ w_final = id_tensor + self.pe.weight
1024
+ b_final = self.pe.bias
1025
+
1026
+ # Introduce reparam conv
1027
+ self.reparam_conv = nn.Conv2d(
1028
+ in_channels=self.in_channels,
1029
+ out_channels=self.embed_dim,
1030
+ kernel_size=self.spatial_shape,
1031
+ stride=1,
1032
+ padding=int(self.spatial_shape[0] // 2),
1033
+ groups=self.embed_dim,
1034
+ bias=True,
1035
+ )
1036
+ self.reparam_conv.weight.data = w_final
1037
+ self.reparam_conv.bias.data = b_final
1038
+
1039
+ self.__delattr__("pe")
1040
+
1041
+
1042
+ class RepMixerBlock(nn.Module):
1043
+ """Implementation of Metaformer block with RepMixer as token mixer.
1044
+
1045
+ For more details on Metaformer structure, please refer to:
1046
+ `MetaFormer Is Actually What You Need for Vision <https://arxiv.org/pdf/2111.11418.pdf>`_
1047
+ """
1048
+
1049
+ def __init__(
1050
+ self,
1051
+ dim: int,
1052
+ kernel_size: int = 3,
1053
+ mlp_ratio: float = 4.0,
1054
+ act_layer: nn.Module = nn.GELU,
1055
+ drop: float = 0.0,
1056
+ drop_path: float = 0.0,
1057
+ use_layer_scale: bool = True,
1058
+ layer_scale_init_value: float = 1e-5,
1059
+ inference_mode: bool = False,
1060
+ ):
1061
+ """Build RepMixer Block.
1062
+
1063
+ Args:
1064
+ dim: Number of embedding dimensions.
1065
+ kernel_size: Kernel size for repmixer. Default: 3
1066
+ mlp_ratio: MLP expansion ratio. Default: 4.0
1067
+ act_layer: Activation layer. Default: ``nn.GELU``
1068
+ drop: Dropout rate. Default: 0.0
1069
+ drop_path: Drop path rate. Default: 0.0
1070
+ use_layer_scale: Flag to turn on layer scale. Default: ``True``
1071
+ layer_scale_init_value: Layer scale value at initialization. Default: 1e-5
1072
+ inference_mode: Flag to instantiate block in inference mode. Default: ``False``
1073
+ """
1074
+
1075
+ super().__init__()
1076
+
1077
+ self.token_mixer = RepMixer(
1078
+ dim,
1079
+ kernel_size=kernel_size,
1080
+ use_layer_scale=use_layer_scale,
1081
+ layer_scale_init_value=layer_scale_init_value,
1082
+ inference_mode=inference_mode,
1083
+ )
1084
+
1085
+ assert mlp_ratio > 0, "MLP ratio should be greater than 0, found: {}".format(
1086
+ mlp_ratio
1087
+ )
1088
+ mlp_hidden_dim = int(dim * mlp_ratio)
1089
+ self.convffn = ConvFFN(
1090
+ in_channels=dim,
1091
+ hidden_channels=mlp_hidden_dim,
1092
+ act_layer=act_layer,
1093
+ drop=drop,
1094
+ )
1095
+
1096
+ # Drop Path
1097
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
1098
+
1099
+ # Layer Scale
1100
+ self.use_layer_scale = use_layer_scale
1101
+ if use_layer_scale:
1102
+ self.layer_scale = nn.Parameter(
1103
+ layer_scale_init_value * torch.ones((dim, 1, 1)), requires_grad=True
1104
+ )
1105
+
1106
+ def forward(self, x):
1107
+ if self.use_layer_scale:
1108
+ x = self.token_mixer(x)
1109
+ x = x + self.drop_path(self.layer_scale * self.convffn(x))
1110
+ else:
1111
+ x = self.token_mixer(x)
1112
+ x = x + self.drop_path(self.convffn(x))
1113
+ return x
1114
+
1115
+
1116
+ class AttentionBlock(nn.Module):
1117
+ """Implementation of metaformer block with MHSA as token mixer.
1118
+
1119
+ For more details on Metaformer structure, please refer to:
1120
+ `MetaFormer Is Actually What You Need for Vision <https://arxiv.org/pdf/2111.11418.pdf>`_
1121
+ """
1122
+
1123
+ def __init__(
1124
+ self,
1125
+ dim: int,
1126
+ mlp_ratio: float = 4.0,
1127
+ act_layer: nn.Module = nn.GELU,
1128
+ norm_layer: nn.Module = nn.BatchNorm2d,
1129
+ drop: float = 0.0,
1130
+ drop_path: float = 0.0,
1131
+ use_layer_scale: bool = True,
1132
+ layer_scale_init_value: float = 1e-5,
1133
+ ):
1134
+ """Build Attention Block.
1135
+
1136
+ Args:
1137
+ dim: Number of embedding dimensions.
1138
+ mlp_ratio: MLP expansion ratio. Default: 4.0
1139
+ act_layer: Activation layer. Default: ``nn.GELU``
1140
+ norm_layer: Normalization layer. Default: ``nn.BatchNorm2d``
1141
+ drop: Dropout rate. Default: 0.0
1142
+ drop_path: Drop path rate. Default: 0.0
1143
+ use_layer_scale: Flag to turn on layer scale. Default: ``True``
1144
+ layer_scale_init_value: Layer scale value at initialization. Default: 1e-5
1145
+ """
1146
+
1147
+ super().__init__()
1148
+
1149
+ # Fallback, sometimes batchnorm tensors
1150
+ # do not get instantiated correctly on some processes
1151
+ # when using deepspeed + accelerate
1152
+ norm_layer_ = norm_layer(num_features=dim)
1153
+ if norm_layer_.weight.shape[0] == 0:
1154
+ norm_layer_.weight = nn.Parameter(torch.zeros(dim))
1155
+ if norm_layer_.bias.shape[0] == 0:
1156
+ norm_layer_.bias = nn.Parameter(torch.zeros(dim))
1157
+
1158
+ self.norm = norm_layer_
1159
+ self.token_mixer = MHSA(dim=dim)
1160
+
1161
+ assert mlp_ratio > 0, "MLP ratio should be greater than 0, found: {}".format(
1162
+ mlp_ratio
1163
+ )
1164
+ mlp_hidden_dim = int(dim * mlp_ratio)
1165
+ self.convffn = ConvFFN(
1166
+ in_channels=dim,
1167
+ hidden_channels=mlp_hidden_dim,
1168
+ act_layer=act_layer,
1169
+ drop=drop,
1170
+ )
1171
+
1172
+ # Drop path
1173
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
1174
+
1175
+ # Layer Scale
1176
+ self.use_layer_scale = use_layer_scale
1177
+ if use_layer_scale:
1178
+ self.layer_scale_1 = nn.Parameter(
1179
+ layer_scale_init_value * torch.ones((dim, 1, 1)), requires_grad=True
1180
+ )
1181
+ self.layer_scale_2 = nn.Parameter(
1182
+ layer_scale_init_value * torch.ones((dim, 1, 1)), requires_grad=True
1183
+ )
1184
+
1185
+ def forward(self, x):
1186
+ if self.use_layer_scale:
1187
+ x = x + self.drop_path(self.layer_scale_1 * self.token_mixer(self.norm(x)))
1188
+ x = x + self.drop_path(self.layer_scale_2 * self.convffn(x))
1189
+ else:
1190
+ x = x + self.drop_path(self.token_mixer(self.norm(x)))
1191
+ x = x + self.drop_path(self.convffn(x))
1192
+ return x
1193
+
1194
+
1195
+ def basic_blocks(
1196
+ dim: int,
1197
+ block_index: int,
1198
+ num_blocks: List[int],
1199
+ token_mixer_type: str,
1200
+ kernel_size: int = 3,
1201
+ mlp_ratio: float = 4.0,
1202
+ act_layer: nn.Module = nn.GELU,
1203
+ norm_layer: nn.Module = nn.BatchNorm2d,
1204
+ drop_rate: float = 0.0,
1205
+ drop_path_rate: float = 0.0,
1206
+ use_layer_scale: bool = True,
1207
+ layer_scale_init_value: float = 1e-5,
1208
+ inference_mode=False,
1209
+ ) -> nn.Sequential:
1210
+ """Build FastViT blocks within a stage.
1211
+
1212
+ Args:
1213
+ dim: Number of embedding dimensions.
1214
+ block_index: block index.
1215
+ num_blocks: List containing number of blocks per stage.
1216
+ token_mixer_type: Token mixer type.
1217
+ kernel_size: Kernel size for repmixer.
1218
+ mlp_ratio: MLP expansion ratio.
1219
+ act_layer: Activation layer.
1220
+ norm_layer: Normalization layer.
1221
+ drop_rate: Dropout rate.
1222
+ drop_path_rate: Drop path rate.
1223
+ use_layer_scale: Flag to turn on layer scale regularization.
1224
+ layer_scale_init_value: Layer scale value at initialization.
1225
+ inference_mode: Flag to instantiate block in inference mode.
1226
+
1227
+ Returns:
1228
+ nn.Sequential object of all the blocks within the stage.
1229
+ """
1230
+ blocks = []
1231
+ for block_idx in range(num_blocks[block_index]):
1232
+ block_dpr = (
1233
+ drop_path_rate
1234
+ * (block_idx + sum(num_blocks[:block_index]))
1235
+ / (sum(num_blocks) - 1)
1236
+ )
1237
+ if token_mixer_type == "repmixer":
1238
+ blocks.append(
1239
+ RepMixerBlock(
1240
+ dim,
1241
+ kernel_size=kernel_size,
1242
+ mlp_ratio=mlp_ratio,
1243
+ act_layer=act_layer,
1244
+ drop=drop_rate,
1245
+ drop_path=block_dpr,
1246
+ use_layer_scale=use_layer_scale,
1247
+ layer_scale_init_value=layer_scale_init_value,
1248
+ inference_mode=inference_mode,
1249
+ )
1250
+ )
1251
+ elif token_mixer_type == "attention":
1252
+ blocks.append(
1253
+ AttentionBlock(
1254
+ dim,
1255
+ mlp_ratio=mlp_ratio,
1256
+ act_layer=act_layer,
1257
+ norm_layer=norm_layer,
1258
+ drop=drop_rate,
1259
+ drop_path=block_dpr,
1260
+ use_layer_scale=use_layer_scale,
1261
+ layer_scale_init_value=layer_scale_init_value,
1262
+ )
1263
+ )
1264
+ else:
1265
+ raise ValueError(
1266
+ "Token mixer type: {} not supported".format(token_mixer_type)
1267
+ )
1268
+ blocks = nn.Sequential(*blocks)
1269
+ return blocks
1270
+
1271
+
1272
+ class GlobalPool2D(nn.Module):
1273
+ """This class implements global pooling with linear projection."""
1274
+
1275
+ def __init__(self, in_dim: int, out_dim: int, *args, **kwargs) -> None:
1276
+ super().__init__()
1277
+ scale = in_dim**-0.5
1278
+ self.proj = nn.Parameter(scale * torch.randn(size=(in_dim, out_dim)))
1279
+ self.in_dim = in_dim
1280
+ self.out_dim = out_dim
1281
+
1282
+ def pool(self, x) -> Tensor:
1283
+ if x.dim() == 4:
1284
+ dims = [-2, -1]
1285
+ elif x.dim() == 5:
1286
+ dims = [-3, -2, -1]
1287
+ x = torch.mean(x, dim=dims, keepdim=False)
1288
+ return x
1289
+
1290
+ def forward(self, x: Tensor, *args, **kwargs) -> Tensor:
1291
+ # x is of shape [batch, in_dim]
1292
+ assert (
1293
+ x.dim() == 4
1294
+ ), "Input should be 4-dimensional (Batch x in_dim x in_height x in_width). Got: {}".format(
1295
+ x.shape
1296
+ )
1297
+
1298
+ # [batch, in_dim, in_height, in_width] --> [batch, in_dim]
1299
+ x = self.pool(x)
1300
+ # [batch, in_dim] x [in_dim, out_dim] --> [batch, out_dim]
1301
+ x = x @ self.proj
1302
+ return x
1303
+
1304
+
1305
+ class FastViT(nn.Module):
1306
+ """
1307
+ This class implements `FastViT architecture <https://arxiv.org/pdf/2303.14189.pdf>`_
1308
+ """
1309
+
1310
+ def __init__(
1311
+ self,
1312
+ layers,
1313
+ token_mixers: Tuple[str, ...],
1314
+ embed_dims=None,
1315
+ mlp_ratios=None,
1316
+ downsamples=None,
1317
+ se_downsamples=None,
1318
+ repmixer_kernel_size=3,
1319
+ norm_layer: nn.Module = nn.BatchNorm2d,
1320
+ act_layer: nn.Module = nn.GELU,
1321
+ num_classes=1000,
1322
+ pos_embs=None,
1323
+ down_patch_size=7,
1324
+ down_stride=2,
1325
+ drop_rate=0.0,
1326
+ drop_path_rate=0.0,
1327
+ use_layer_scale=True,
1328
+ layer_scale_init_value=1e-5,
1329
+ init_cfg=None,
1330
+ pretrained=None,
1331
+ cls_ratio=2.0,
1332
+ inference_mode=False,
1333
+ stem_scale_branch=True,
1334
+ **kwargs,
1335
+ ) -> None:
1336
+
1337
+ super().__init__()
1338
+
1339
+ self.num_classes = num_classes
1340
+ if len(layers) == 4:
1341
+ self.out_indices = [0, 2, 4, 7]
1342
+ elif len(layers) == 5:
1343
+ self.out_indices = [0, 2, 4, 7, 10]
1344
+ else:
1345
+ raise NotImplementedError("FPN is not implemented for more than 5 stages.")
1346
+
1347
+ if pos_embs is None:
1348
+ pos_embs = [None] * len(layers)
1349
+
1350
+ if se_downsamples is None:
1351
+ se_downsamples = [False] * len(layers)
1352
+
1353
+ # Convolutional stem
1354
+ self.patch_embed = convolutional_stem(3, embed_dims[0], inference_mode,
1355
+ use_scale_branch=stem_scale_branch)
1356
+
1357
+ # Build the main stages of the network architecture
1358
+ network = []
1359
+ for i in range(len(layers)):
1360
+ # Add position embeddings if requested
1361
+ if pos_embs[i] is not None:
1362
+ network.append(
1363
+ pos_embs[i](
1364
+ embed_dims[i], embed_dims[i], inference_mode=inference_mode
1365
+ )
1366
+ )
1367
+ stage = basic_blocks(
1368
+ embed_dims[i],
1369
+ i,
1370
+ layers,
1371
+ token_mixer_type=token_mixers[i],
1372
+ kernel_size=repmixer_kernel_size,
1373
+ mlp_ratio=mlp_ratios[i],
1374
+ act_layer=act_layer,
1375
+ norm_layer=norm_layer,
1376
+ drop_rate=drop_rate,
1377
+ drop_path_rate=drop_path_rate,
1378
+ use_layer_scale=use_layer_scale,
1379
+ layer_scale_init_value=layer_scale_init_value,
1380
+ inference_mode=inference_mode,
1381
+ )
1382
+ network.append(stage)
1383
+ if i >= len(layers) - 1:
1384
+ break
1385
+
1386
+ # Patch merging/downsampling between stages.
1387
+ if downsamples[i] or embed_dims[i] != embed_dims[i + 1]:
1388
+ network.append(
1389
+ PatchEmbed(
1390
+ patch_size=down_patch_size,
1391
+ stride=down_stride,
1392
+ in_channels=embed_dims[i],
1393
+ embed_dim=embed_dims[i + 1],
1394
+ inference_mode=inference_mode,
1395
+ use_se=se_downsamples[i + 1],
1396
+ )
1397
+ )
1398
+ self.network = nn.ModuleList(network)
1399
+
1400
+ # Classifier head
1401
+ self.conv_exp = MobileOneBlock(
1402
+ in_channels=embed_dims[-1],
1403
+ out_channels=int(embed_dims[-1] * cls_ratio),
1404
+ kernel_size=3,
1405
+ stride=1,
1406
+ padding=1,
1407
+ groups=embed_dims[-1],
1408
+ inference_mode=inference_mode,
1409
+ use_se=True,
1410
+ num_conv_branches=1,
1411
+ )
1412
+ self.head = (
1413
+ nn.Linear(int(embed_dims[-1] * cls_ratio), num_classes)
1414
+ if num_classes > 0
1415
+ else nn.Identity()
1416
+ )
1417
+ self.apply(self.cls_init_weights)
1418
+ self.init_cfg = copy.deepcopy(init_cfg)
1419
+
1420
+ def cls_init_weights(self, m: nn.Module) -> None:
1421
+ """Init. for classification"""
1422
+ if isinstance(m, nn.Linear):
1423
+ normal_(m.weight, std=0.02)
1424
+ if isinstance(m, nn.Linear) and m.bias is not None:
1425
+ nn.init.constant_(m.bias, 0)
1426
+
1427
+ def forward_embeddings(self, x: torch.Tensor) -> torch.Tensor:
1428
+ x = self.patch_embed(x)
1429
+ return x
1430
+
1431
+ def forward_tokens(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
1432
+ for idx, block in enumerate(self.network):
1433
+ x = block(x)
1434
+ return x
1435
+
1436
+ def forward(self, x: torch.Tensor, *args, **kwargs) -> Union[Tensor, Dict[str, Tensor]]:
1437
+ # input embedding
1438
+ x = self.forward_embeddings(x)
1439
+ # through backbone
1440
+ x = self.forward_tokens(x)
1441
+ # for image classification/embedding
1442
+ x = self.conv_exp(x)
1443
+ cls_out = self.head(x)
1444
+
1445
+ out_dict = dict()
1446
+ if kwargs.get("return_image_embeddings", False):
1447
+ out_dict.update({"logits": cls_out})
1448
+ out_dict.update({"image_embeddings": x})
1449
+ return out_dict
1450
+ else:
1451
+ return cls_out
1452
+
1453
+
1454
+ @register_model
1455
+ def fastvithd(pretrained=False, **kwargs):
1456
+ """Instantiate FastViTHD model variant."""
1457
+ layers = [2, 12, 24, 4, 2]
1458
+ embed_dims = [96, 192, 384, 768, 1536]
1459
+ mlp_ratios = [4, 4, 4, 4, 4]
1460
+ downsamples = [True, True, True, True, True]
1461
+ pos_embs = [None, None, None, partial(RepCPE, spatial_shape=(7, 7)), partial(RepCPE, spatial_shape=(7, 7))]
1462
+ token_mixers = ("repmixer", "repmixer", "repmixer", "attention", "attention")
1463
+ model = FastViT(
1464
+ layers,
1465
+ token_mixers=token_mixers,
1466
+ embed_dims=embed_dims,
1467
+ pos_embs=pos_embs,
1468
+ mlp_ratios=mlp_ratios,
1469
+ downsamples=downsamples,
1470
+ norm_layer=LayerNormChannel,
1471
+ stem_scale_branch=False,
1472
+ inference_mode=True,
1473
+ **kwargs,
1474
+ )
1475
+ model.default_cfg = default_cfgs["fastvit_m"]
1476
+ if pretrained:
1477
+ raise ValueError("Functionality not implemented.")
1478
+ return model
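
The MobileOne / RepMixer blocks above are trained with several parallel branches (k x k conv-BN, 1x1 scale conv-BN, and a BN skip) that `reparameterize()` folds into a single convolution for inference. A small equivalence check, assuming timm is installed and mci.py is importable as a top-level module (a sketch, not part of the commit):

# Verify that reparameterize() preserves a MobileOneBlock's output.
import torch
from mci import MobileOneBlock

block = MobileOneBlock(
    in_channels=64, out_channels=64, kernel_size=3,
    stride=1, padding=1, groups=64,              # depthwise, as in the stem
    inference_mode=False, num_conv_branches=1,
).eval()                                          # eval: BN uses running stats, so fusion is exact

x = torch.randn(1, 64, 32, 32)
with torch.no_grad():
    y_multi_branch = block(x)                     # train-time multi-branch forward
    block.reparameterize()                        # fuse branches into block.reparam_conv
    y_fused = block(x)                            # single-conv forward

print(torch.allclose(y_multi_branch, y_fused, atol=1e-5))   # expected: True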
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b17eb184e6d9be913c405a8bbcccc5baf7a2462bb3ec4d850e02b3a7ed5391a
+ size 250290912
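
The checkpoint is stored as a Git LFS pointer; the ~250 MB size is consistent with roughly 125M parameters in float16 (matching `torch_dtype` in config.json). A quick inspection sketch, assuming the actual file has been pulled locally:

# Inspect tensor names and shapes without building the model.
from safetensors import safe_open

with safe_open("model.safetensors", framework="pt") as f:
    keys = list(f.keys())
    print(len(keys), "tensors")
    t = f.get_tensor(keys[0])
    print(keys[0], tuple(t.shape), t.dtype)       # dtype should be torch.float16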
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "crop_size": {
+     "height": 1024,
+     "width": 1024
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.0,
+     0.0,
+     0.0
+   ],
+   "image_processor_type": "CLIPImageProcessor",
+   "image_std": [
+     1.0,
+     1.0,
+     1.0
+   ],
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 1024
+   }
+ }
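
With image_mean = [0, 0, 0] and image_std = [1, 1, 1], normalization is a no-op beyond the 1/255 rescale, so the encoder expects raw RGB values in [0, 1]; `resample: 3` is PIL bicubic. A sketch that builds the same processor directly from these values:

# Equivalent processor construction (values copied from preprocessor_config.json).
from PIL import Image
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor(
    do_convert_rgb=True,
    do_resize=True, size={"shortest_edge": 1024}, resample=3,        # bicubic
    do_center_crop=True, crop_size={"height": 1024, "width": 1024},
    do_rescale=True, rescale_factor=1 / 255,
    do_normalize=True, image_mean=[0.0, 0.0, 0.0], image_std=[1.0, 1.0, 1.0],
)

image = Image.new("RGB", (1280, 960))             # placeholder image
pixel_values = processor(image, return_tensors="pt")["pixel_values"]
print(pixel_values.shape)                         # torch.Size([1, 3, 1024, 1024])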