Ding Yiwei DingYiwei glenn-jocher committed on
Commit
1148e2e
1 Parent(s): b8b8629

Add TransformerLayer, TransformerBlock, C3TR modules (#2333)

Browse files

* yolotr

* transformer block

* Remove bias in Transformer

* Remove C3T

* Remove a deprecated class

* put the 2nd LayerNorm into the 2nd residual block

* move example model to models/hub, rename to -transformer

* Add module comments and TODOs

* Remove LN in Transformer

* Add comments for Transformer

* Solve the problem of MA (nn.MultiheadAttention) with DDP

* cleanup

* cleanup find_unused_parameters

* PEP8 reformat

Co-authored-by: DingYiwei <846414640@qq.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>

models/common.py CHANGED
@@ -43,6 +43,52 @@ class Conv(nn.Module):
43
  return self.act(self.conv(x))
44
 
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  class Bottleneck(nn.Module):
47
  # Standard bottleneck
48
  def __init__(self, c1, c2, shortcut=True, g=1, e=0.5): # ch_in, ch_out, shortcut, groups, expansion
@@ -90,6 +136,14 @@ class C3(nn.Module):
90
  return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
91
 
92
 
 
 
 
 
 
 
 
 
93
  class SPP(nn.Module):
94
  # Spatial pyramid pooling layer used in YOLOv3-SPP
95
  def __init__(self, c1, c2, k=(5, 9, 13)):
 
43
  return self.act(self.conv(x))
44
 
45
 
46
class TransformerLayer(nn.Module):
    # Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)
    # Layout: bias-free q/k/v projections -> multi-head attention -> two stacked bias-free linear layers,
    # with one residual (skip) connection around the attention step and one around the linear pair.
    def __init__(self, c, num_heads):
        """Build the layer.

        c: feature width of every projection (input and output width are both c).
        num_heads: number of attention heads handed to nn.MultiheadAttention.
        """
        super().__init__()
        # Separate learned projections feeding the query / key / value inputs of the attention module
        self.q = nn.Linear(c, c, bias=False)
        self.k = nn.Linear(c, c, bias=False)
        self.v = nn.Linear(c, c, bias=False)
        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
        # Feed-forward part: two linear maps applied back to back (no activation in between)
        self.fc1 = nn.Linear(c, c, bias=False)
        self.fc2 = nn.Linear(c, c, bias=False)

    def forward(self, x):
        """Apply attention then the linear pair, each with a skip connection; output has x's shape.

        NOTE(review): x is presumably (sequence, batch, c), the default nn.MultiheadAttention
        layout -- confirm against the caller.
        """
        # nn.MultiheadAttention returns a pair; only the first item (the attended output) is used here
        attended, _ = self.ma(self.q(x), self.k(x), self.v(x))
        x = attended + x
        return self.fc2(self.fc1(x)) + x
61
+
62
+
63
class TransformerBlock(nn.Module):
    # Vision Transformer https://arxiv.org/abs/2010.11929
    # Optionally adapts channels c1 -> c2 with a Conv, flattens the spatial grid into a sequence,
    # adds a learned embedding, runs num_layers TransformerLayer modules and restores the 4-D layout.
    def __init__(self, c1, c2, num_heads, num_layers):
        """Build the block.

        c1: input channels, c2: output channels (also the transformer width),
        num_heads: attention heads per layer, num_layers: number of stacked TransformerLayer modules.
        """
        super().__init__()
        # A channel-adapting Conv is only needed when the widths differ; otherwise the input passes through
        self.conv = Conv(c1, c2) if c1 != c2 else None
        self.linear = nn.Linear(c2, c2)  # learnable position embedding
        layers = [TransformerLayer(c2, num_heads) for _ in range(num_layers)]
        self.tr = nn.Sequential(*layers)
        self.c2 = c2

    def forward(self, x):
        """Map a 4-D (b, c1, dim2, dim3) input to a 4-D (b, c2, dim2, dim3) output."""
        if self.conv is not None:
            x = self.conv(x)
        b, _, height, width = x.shape
        # (b, c2, height, width) -> (b, c2, height*width) -> (height*width, b, c2)
        tokens = x.flatten(2).permute(2, 0, 1)
        # The embedding is computed from the tokens themselves and added back onto them
        tokens = tokens + self.linear(tokens)

        tokens = self.tr(tokens)
        # (height*width, b, c2) -> (b, c2, height*width) -> (b, c2, height, width)
        return tokens.permute(1, 2, 0).reshape(b, self.c2, height, width)
90
+
91
+
92
  class Bottleneck(nn.Module):
93
  # Standard bottleneck
94
  def __init__(self, c1, c2, shortcut=True, g=1, e=0.5): # ch_in, ch_out, shortcut, groups, expansion
 
136
  return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
137
 
138
 
139
class C3TR(C3):
    # C3 module with TransformerBlock()
    # Same constructor as C3; after the parent is built, its inner module `m` is replaced by a
    # TransformerBlock with 4 attention heads and n layers, working on int(c2 * e) channels.
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """c1/c2: input/output channels, n: transformer layers, shortcut/g: forwarded to C3, e: width ratio."""
        super().__init__(c1, c2, n, shortcut, g, e)
        hidden = int(c2 * e)  # channel width handed to the TransformerBlock (in and out)
        num_heads = 4
        self.m = TransformerBlock(hidden, hidden, num_heads, n)
145
+
146
+
147
  class SPP(nn.Module):
148
  # Spatial pyramid pooling layer used in YOLOv3-SPP
149
  def __init__(self, c1, c2, k=(5, 9, 13)):
models/hub/yolov5s-transformer.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # parameters
2
+ nc: 80 # number of classes
3
+ depth_multiple: 0.33 # model depth multiple
4
+ width_multiple: 0.50 # layer channel multiple
5
+
6
+ # anchors
7
+ anchors:
8
+ - [10,13, 16,30, 33,23] # P3/8
9
+ - [30,61, 62,45, 59,119] # P4/16
10
+ - [116,90, 156,198, 373,326] # P5/32
11
+
12
+ # YOLOv5 backbone
13
+ backbone:
14
+ # [from, number, module, args]
15
+ [[-1, 1, Focus, [64, 3]], # 0-P1/2
16
+ [-1, 1, Conv, [128, 3, 2]], # 1-P2/4
17
+ [-1, 3, C3, [128]],
18
+ [-1, 1, Conv, [256, 3, 2]], # 3-P3/8
19
+ [-1, 9, C3, [256]],
20
+ [-1, 1, Conv, [512, 3, 2]], # 5-P4/16
21
+ [-1, 9, C3, [512]],
22
+ [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
23
+ [-1, 1, SPP, [1024, [5, 9, 13]]],
24
+ [-1, 3, C3TR, [1024, False]], # 9 <-------- C3TR() Transformer module
25
+ ]
26
+
27
+ # YOLOv5 head
28
+ head:
29
+ [[-1, 1, Conv, [512, 1, 1]],
30
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
31
+ [[-1, 6], 1, Concat, [1]], # cat backbone P4
32
+ [-1, 3, C3, [512, False]], # 13
33
+
34
+ [-1, 1, Conv, [256, 1, 1]],
35
+ [-1, 1, nn.Upsample, [None, 2, 'nearest']],
36
+ [[-1, 4], 1, Concat, [1]], # cat backbone P3
37
+ [-1, 3, C3, [256, False]], # 17 (P3/8-small)
38
+
39
+ [-1, 1, Conv, [256, 3, 2]],
40
+ [[-1, 14], 1, Concat, [1]], # cat head P4
41
+ [-1, 3, C3, [512, False]], # 20 (P4/16-medium)
42
+
43
+ [-1, 1, Conv, [512, 3, 2]],
44
+ [[-1, 10], 1, Concat, [1]], # cat head P5
45
+ [-1, 3, C3, [1024, False]], # 23 (P5/32-large)
46
+
47
+ [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
48
+ ]
models/yolo.py CHANGED
@@ -215,13 +215,13 @@ def parse_model(d, ch): # model_dict, input_channels(3)
215
 
216
  n = max(round(n * gd), 1) if n > 1 else n # depth gain
217
  if m in [Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, DWConv, MixConv2d, Focus, CrossConv, BottleneckCSP,
218
- C3]:
219
  c1, c2 = ch[f], args[0]
220
  if c2 != no: # if not output
221
  c2 = make_divisible(c2 * gw, 8)
222
 
223
  args = [c1, c2, *args[1:]]
224
- if m in [BottleneckCSP, C3]:
225
  args.insert(2, n) # number of repeats
226
  n = 1
227
  elif m is nn.BatchNorm2d:
 
215
 
216
  n = max(round(n * gd), 1) if n > 1 else n # depth gain
217
  if m in [Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, DWConv, MixConv2d, Focus, CrossConv, BottleneckCSP,
218
+ C3, C3TR]:
219
  c1, c2 = ch[f], args[0]
220
  if c2 != no: # if not output
221
  c2 = make_divisible(c2 * gw, 8)
222
 
223
  args = [c1, c2, *args[1:]]
224
+ if m in [BottleneckCSP, C3, C3TR]:
225
  args.insert(2, n) # number of repeats
226
  n = 1
227
  elif m is nn.BatchNorm2d:
train.py CHANGED
@@ -218,7 +218,9 @@ def train(hyp, opt, device, tb_writer=None):
218
 
219
  # DDP mode
220
  if cuda and rank != -1:
221
- model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)
 
 
222
 
223
  # Model parameters
224
  hyp['box'] *= 3. / nl # scale to layers
 
218
 
219
  # DDP mode
220
  if cuda and rank != -1:
221
+ model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank,
222
+ # nn.MultiheadAttention incompatibility with DDP https://github.com/pytorch/pytorch/issues/26698
223
+ find_unused_parameters=any(isinstance(layer, nn.MultiheadAttention) for layer in model.modules()))
224
 
225
  # Model parameters
226
  hyp['box'] *= 3. / nl # scale to layers