-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodels_vit.py
More file actions
71 lines (56 loc) · 2.5 KB
/
models_vit.py
File metadata and controls
71 lines (56 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm
# DeiT: https://github.com/facebookresearch/deit
# --------------------------------------------------------
from functools import partial
import torch
import torch.nn as nn
import timm.models.vision_transformer
class VisionTransformer(timm.models.vision_transformer.VisionTransformer):
    """Vision Transformer with a configurable MLP head and optional tanh output.

    Subclasses timm's ``VisionTransformer``, replaces ``self.head`` with either
    a single linear layer or a small MLP, and adds a helper for freezing part
    of the encoder.

    Args:
        global_pool: Stored on ``self.global_pool`` *after* the parent
            constructor has run; it is not forwarded to the parent
            constructor. NOTE(review): the inherited ``forward_head``
            presumably reads this attribute, so the value should be one the
            installed timm version understands -- confirm against callers.
        tanh: If true, ``forward`` applies ``torch.tanh`` to the head output.
        head_layers: Number of linear layers in the head. Values ``<= 1``
            give a plain ``nn.Linear``; larger values give an ``nn.Sequential``
            of ``head_layers - 1`` hidden ``Linear + ReLU`` pairs (width
            ``embed_dim``) followed by the output ``Linear``.
        **kwargs: Forwarded unchanged to the timm ``VisionTransformer``
            constructor.
    """

    def __init__(self, global_pool, tanh=False, head_layers=1, **kwargs):
        super().__init__(**kwargs)
        self.global_pool = global_pool
        self.tanh = tanh

        # Fall back to the value recorded by the parent constructor so that
        # omitting ``num_classes`` (i.e. using the parent's default) does not
        # raise KeyError here.
        num_classes = kwargs.get('num_classes', self.num_classes)

        hidden = []
        for _ in range(head_layers - 1):
            hidden += [nn.Linear(self.embed_dim, self.embed_dim), nn.ReLU()]
        out_proj = nn.Linear(self.embed_dim, num_classes)

        # Keep a bare Linear for the single-layer case so parameter names stay
        # ``head.weight`` / ``head.bias`` rather than ``head.0.*``.
        self.head = nn.Sequential(*hidden, out_proj) if hidden else out_proj

    def freeze_encoder(self, num_blocks=None):
        """Disable gradients for encoder blocks and the patch projection.

        Args:
            num_blocks: If ``None``, every module in ``self.blocks`` is
                frozen; otherwise only ``self.blocks[:num_blocks]``.

        The parameters of ``self.patch_embed.proj`` are frozen in both cases.
        No other parameters are touched by this method.
        """
        to_freeze = self.blocks if num_blocks is None else self.blocks[:num_blocks]
        for param in to_freeze.parameters():
            param.requires_grad = False
        for param in self.patch_embed.proj.parameters():
            param.requires_grad = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the inherited feature extractor and head, then optional tanh.

        Args:
            x: Input tensor passed straight to ``forward_features``.

        Returns:
            The output of ``forward_head``, passed through ``torch.tanh`` when
            ``self.tanh`` is truthy.
        """
        x = self.forward_head(self.forward_features(x))
        return torch.tanh(x) if self.tanh else x
def vit_small_patch16(**kwargs):
    """Build the "small" patch-16 ViT: width 512, 12 blocks, 8 heads.

    Extra keyword arguments are forwarded to ``VisionTransformer``.
    """
    arch = dict(
        patch_size=16,
        embed_dim=512,
        depth=12,
        num_heads=8,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
    )
    # Two separate ** expansions (not a merged dict) so that a caller passing
    # one of the fixed keys still gets a TypeError, as before.
    return VisionTransformer(**arch, **kwargs)
def vit_medium_patch16(**kwargs):
    """Build the "medium" patch-16 ViT: width 768, 12 blocks, 12 heads.

    Extra keyword arguments are forwarded to ``VisionTransformer``.
    """
    arch = dict(
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
    )
    # Two separate ** expansions (not a merged dict) so that a caller passing
    # one of the fixed keys still gets a TypeError, as before.
    return VisionTransformer(**arch, **kwargs)
def vit_large_patch16(**kwargs):
    """Build the "large" patch-16 ViT: width 1024, 24 blocks, 16 heads.

    Extra keyword arguments are forwarded to ``VisionTransformer``.
    """
    arch = dict(
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
    )
    # Two separate ** expansions (not a merged dict) so that a caller passing
    # one of the fixed keys still gets a TypeError, as before.
    return VisionTransformer(**arch, **kwargs)