hbfreed committed
Commit 010a8b2
1 parent: 6d5c70d

added more necessary stuff

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ demo_files/downloaded_videos/Ball_demo_video.mp4 filter=lfs diff=lfs merge=lfs -text
+ demo_files/downloaded_videos/demo_video.mp4 filter=lfs diff=lfs merge=lfs -text
+ demo_files/downloaded_videos/Strike_demo_video.mp4 filter=lfs diff=lfs merge=lfs -text
+ picklebot_2m.csv filter=lfs diff=lfs merge=lfs -text
demo_files/ball_high.mp4 ADDED
Binary file (274 kB)

demo_files/ball_in_dirt.mp4 ADDED
Binary file (214 kB)

demo_files/ball_outside.mp4 ADDED
Binary file (290 kB)

demo_files/downloaded_videos/Ball_cropped_video.mp4 ADDED
Binary file (269 kB)

demo_files/downloaded_videos/Ball_demo_video.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f49ba1fa3f64f7abd4555c372833cf5b594806772516acf511a59b254c175372
+ size 7116147
demo_files/downloaded_videos/Strike_cropped_video.mp4 ADDED
Binary file (326 kB)
 
demo_files/downloaded_videos/Strike_demo_video.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c8021464006fa91f0f529d0044075ab63a6fb251bbd849dcf25f925d16344a92
+ size 7753041
demo_files/downloaded_videos/cropped_video.mp4 ADDED
Binary file (212 kB)
 
demo_files/downloaded_videos/demo_video.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:411caea9714f16d6b302d6ad5f4565358acc706490134c0963c09f7828315572
+ size 5934043
demo_files/strike_high.mp4 ADDED
Binary file (126 kB)

demo_files/strike_middle.mp4 ADDED
Binary file (298 kB)

demo_files/strike_outside_corner.mp4 ADDED
Binary file (129 kB)
 
mobilenet.py ADDED
@@ -0,0 +1,447 @@
+ '''
+ Implementing MobileNetV3 as described in
+ "Searching for MobileNetV3", adapted for video classification.
+ Note that balls are labeled 0 and strikes are labeled 1.
+ '''
+ import torch
+ import torch.nn as nn
+ import torch.nn.init as init
+ import torch.nn.functional as F
+
+ class SEBlock3D(nn.Module):
+     def __init__(self,channels):
+         super().__init__()
+
+         self.se = nn.Sequential(
+             nn.AdaptiveAvgPool3d(1),
+             nn.Conv3d(channels,channels//4,kernel_size=1),
+             nn.ReLU(inplace=True),
+             nn.Conv3d(channels//4,channels,kernel_size=1),
+             nn.Hardsigmoid()
+         )
+
+     def forward(self,x):
+         w = self.se(x)
+         x = x * w
+         return x
+
+
+ class SEBlock2D(nn.Module):
+     def __init__(self,channels):
+         super().__init__()
+
+         self.se = nn.Sequential(
+             nn.AdaptiveAvgPool2d(1),
+             nn.Conv2d(channels,channels//4,kernel_size=1),
+             nn.ReLU(inplace=True),
+             nn.Conv2d(channels//4,channels,kernel_size=1),
+             nn.Hardsigmoid()
+         )
+
+     def forward(self,x):
+         w = self.se(x)
+         x = x * w
+         return x
+
+ #Bottleneck for MobileNets
+ class Bottleneck3D(nn.Module):
+     def __init__(self, in_channels, out_channels, expanded_channels, stride=1, use_se=False, kernel_size=3, nonlinearity=nn.Hardswish(), batchnorm=True, dropout=0, bias=False):
+         super().__init__()
+
+         #pointwise conv1x1x1 (expand channels)
+         self.pointwise_conv1 = nn.Conv3d(in_channels,expanded_channels,kernel_size=1,bias=bias)
+         #depthwise (spatial filtering)
+         #groups to preserve channel-wise information
+         self.depthwise_conv = nn.Conv3d(
+             expanded_channels, #in channels
+             expanded_channels, #out channels
+             groups=expanded_channels,
+             kernel_size=(1,kernel_size,kernel_size),
+             stride=stride,
+             padding=(0,kernel_size//2,kernel_size//2), #pad spatial dims only; the temporal kernel is 1
+             bias=bias
+         )
+         #squeeze-and-excite (recalibrate channel-wise)
+         self.squeeze_excite = SEBlock3D(expanded_channels) if use_se else None
+         #pointwise conv1x1x1 (project back down to out_channels)
+         self.pointwise_conv2 = nn.Conv3d(expanded_channels,out_channels,kernel_size=1,bias=bias)
+         self.batchnorm = nn.BatchNorm3d(out_channels) if batchnorm else None
+         self.nonlinearity = nonlinearity
+         self.dropout = nn.Dropout3d(p=dropout)
+
+     def forward(self,x):
+         x = self.pointwise_conv1(x)
+         x = self.depthwise_conv(x)
+         if self.squeeze_excite is not None:
+             x = self.squeeze_excite(x)
+         x = self.pointwise_conv2(x)
+         if self.batchnorm is not None:
+             x = self.batchnorm(x)
+         x = self.nonlinearity(x)
+         x = self.dropout(x)
+         return x
+
+
+ #2D bottleneck for our 2D convnet with LSTM
+ class Bottleneck2D(nn.Module):
+     def __init__(self, in_channels, out_channels, expanded_channels, stride=1, use_se=False, kernel_size=3, nonlinearity=nn.Hardswish(), batchnorm=True, dropout=0, bias=False):
+         super().__init__()
+
+         #pointwise conv1x1 (expand channels)
+         self.pointwise_conv1 = nn.Conv2d(in_channels,expanded_channels,kernel_size=1,bias=bias)
+         #depthwise (spatial filtering)
+         #groups to preserve channel-wise information
+         self.depthwise_conv = nn.Conv2d(
+             expanded_channels, #in channels
+             expanded_channels, #out channels
+             groups=expanded_channels,
+             kernel_size=kernel_size,
+             stride=stride,
+             padding=kernel_size//2,
+             bias=bias
+         )
+         #squeeze-and-excite (recalibrate channel-wise)
+         self.squeeze_excite = SEBlock2D(expanded_channels) if use_se else None
+         #pointwise conv1x1 (project back down to out_channels)
+         self.pointwise_conv2 = nn.Conv2d(expanded_channels,out_channels,kernel_size=1,bias=bias)
+         self.batchnorm = nn.BatchNorm2d(out_channels) if batchnorm else None
+         self.nonlinearity = nonlinearity
+         self.dropout = nn.Dropout2d(p=dropout)
+
+     def forward(self,x):
+         x = self.pointwise_conv1(x)
+         x = self.depthwise_conv(x)
+         if self.squeeze_excite is not None:
+             x = self.squeeze_excite(x)
+         x = self.pointwise_conv2(x)
+         if self.batchnorm is not None:
+             x = self.batchnorm(x)
+         x = self.nonlinearity(x)
+         return x
+
+ #MobileNetV3-Large with 3D convolutions
+ class MobileNetLarge3D(nn.Module):
+     def __init__(self,num_classes=2):
+         super().__init__()
+
+         self.num_classes = num_classes
+
+         #conv3d (h-swish): 224x224x3 -> 112x112x16
+         self.block1 = nn.Sequential(
+             nn.Conv3d(in_channels=3,out_channels=16,stride=2,kernel_size=3,padding=1),
+             nn.BatchNorm3d(16),
+             nn.Hardswish()
+         )
+
+         #3x3 bottlenecks1 (3, ReLU): 112x112x16 -> 56x56x24
+         self.block2 = nn.Sequential(
+             Bottleneck3D(in_channels=16,out_channels=16,expanded_channels=16,stride=1,nonlinearity=nn.ReLU(),dropout=0.2),
+             Bottleneck3D(in_channels=16,out_channels=24,expanded_channels=64,stride=2,nonlinearity=nn.ReLU(),dropout=0.2),
+             Bottleneck3D(in_channels=24,out_channels=24,expanded_channels=72,stride=1,nonlinearity=nn.ReLU(),dropout=0.2)
+         )
+
+         #5x5 bottlenecks1 (3, ReLU, squeeze-excite): 56x56x24 -> 28x28x40
+         self.block3 = nn.Sequential(
+             Bottleneck3D(in_channels=24,out_channels=40,expanded_channels=72,stride=2,use_se=True,kernel_size=5,nonlinearity=nn.ReLU(),dropout=0.2),
+             Bottleneck3D(in_channels=40,out_channels=40,expanded_channels=120,stride=1,use_se=True,kernel_size=5,nonlinearity=nn.ReLU(),dropout=0.2),
+             Bottleneck3D(in_channels=40,out_channels=40,expanded_channels=120,stride=1,use_se=True,kernel_size=5,nonlinearity=nn.ReLU(),dropout=0.2)
+         )
+
+         #3x3 bottlenecks2 (6, h-swish, last two get squeeze-excite): 28x28x40 -> 14x14x112
+         self.block4 = nn.Sequential(
+             Bottleneck3D(in_channels=40,out_channels=80,expanded_channels=240,stride=2,dropout=0.2),
+             Bottleneck3D(in_channels=80,out_channels=80,expanded_channels=240,stride=1,dropout=0.2),
+             Bottleneck3D(in_channels=80,out_channels=80,expanded_channels=184,stride=1,dropout=0.2),
+             Bottleneck3D(in_channels=80,out_channels=80,expanded_channels=184,stride=1,dropout=0.2),
+             Bottleneck3D(in_channels=80,out_channels=112,expanded_channels=480,stride=1,use_se=True,dropout=0.2),
+             Bottleneck3D(in_channels=112,out_channels=112,expanded_channels=672,stride=1,use_se=True,dropout=0.2)
+         )
+
+         #5x5 bottlenecks2 (3, h-swish, squeeze-excite): 14x14x112 -> 7x7x160
+         self.block5 = nn.Sequential(
+             Bottleneck3D(in_channels=112,out_channels=160,expanded_channels=672,stride=2,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck3D(in_channels=160,out_channels=160,expanded_channels=960,stride=1,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck3D(in_channels=160,out_channels=160,expanded_channels=960,stride=1,use_se=True,kernel_size=5,dropout=0.2)
+         )
+
+         #conv3d (h-swish): 7x7x160 -> 7x7x960 (pooled to 1x1 in the classifier)
+         self.block6 = nn.Sequential(
+             nn.Conv3d(in_channels=160,out_channels=960,stride=1,kernel_size=1),
+             nn.BatchNorm3d(960),
+             nn.Hardswish()
+         )
+
+         #classifier: adaptive avg pool, then two conv3d 1x1 NBN (first uses h-swish): 1x1x960 -> num_classes
+         self.classifier = nn.Sequential(
+             nn.AdaptiveAvgPool3d((1,1,1)),
+             nn.Conv3d(in_channels=960,out_channels=1280,kernel_size=1,stride=1,padding=0),
+             nn.Hardswish(),
+             nn.Conv3d(in_channels=1280,out_channels=self.num_classes,kernel_size=1,stride=1,padding=0) #2 classes for ball/strike
+         )
+
+     def forward(self,x):
+         x = self.block1(x)
+         x = self.block2(x)
+         x = self.block3(x)
+         x = self.block4(x)
+         x = self.block5(x)
+         x = self.block6(x)
+         x = self.classifier(x)
+         x = x.view(x.shape[0], self.num_classes)
+         return x
+
+     def initialize_weights(self):
+         for module in self.modules():
+             if isinstance(module, nn.Conv3d) or isinstance(module, nn.Linear):
+                 if hasattr(module, "nonlinearity"):
+                     if module.nonlinearity == 'relu':
+                         init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
+                     elif module.nonlinearity == 'hardswish':
+                         init.xavier_uniform_(module.weight)
+             elif isinstance(module, nn.BatchNorm3d):
+                 init.constant_(module.weight, 1)
+                 init.constant_(module.bias, 0)
+
+ #MobileNetV3-Small with 3D convolutions
+ class MobileNetSmall3D(nn.Module):
+     def __init__(self,num_classes=2):
+         super().__init__()
+
+         self.num_classes = num_classes
+
+         #conv3d (h-swish): 224x224x3 -> 112x112x16
+         self.block1 = nn.Sequential(
+             nn.Conv3d(in_channels=3,out_channels=16,kernel_size=3,stride=2,padding=1),
+             nn.BatchNorm3d(16),
+             nn.Hardswish()
+         )
+
+         #3x3 bottlenecks (3, LeakyReLU, first gets squeeze-excite): 112x112x16 -> 28x28x24
+         self.block2 = nn.Sequential(
+             Bottleneck3D(in_channels=16,out_channels=16,expanded_channels=16,stride=2,use_se=True,nonlinearity=nn.LeakyReLU(),dropout=0.2),
+             Bottleneck3D(in_channels=16,out_channels=24,expanded_channels=72,stride=2,nonlinearity=nn.LeakyReLU(),dropout=0.2),
+             Bottleneck3D(in_channels=24,out_channels=24,expanded_channels=88,stride=1,nonlinearity=nn.LeakyReLU(),dropout=0.2)
+         )
+         #5x5 bottlenecks (8, h-swish, squeeze-excite): 28x28x24 -> 7x7x96
+         self.block3 = nn.Sequential(
+             Bottleneck3D(in_channels=24,out_channels=40,expanded_channels=96,stride=2,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck3D(in_channels=40,out_channels=40,expanded_channels=240,stride=1,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck3D(in_channels=40,out_channels=40,expanded_channels=240,stride=1,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck3D(in_channels=40,out_channels=48,expanded_channels=120,stride=1,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck3D(in_channels=48,out_channels=48,expanded_channels=144,stride=1,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck3D(in_channels=48,out_channels=96,expanded_channels=288,stride=2,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck3D(in_channels=96,out_channels=96,expanded_channels=576,stride=1,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck3D(in_channels=96,out_channels=96,expanded_channels=576,stride=1,use_se=True,kernel_size=5,dropout=0.2)
+         )
+         #conv3d (h-swish): 7x7x96 -> 7x7x576 (pooled to 1x1 in the classifier)
+         self.block4 = nn.Sequential(
+             nn.Conv3d(in_channels=96,out_channels=576,kernel_size=1,stride=1,padding=0),
+             SEBlock3D(channels=576),
+             nn.BatchNorm3d(576),
+             nn.Hardswish()
+         )
+         #classifier: adaptive avg pool, then two conv3d 1x1 NBN (first uses h-swish): 1x1x576 -> num_classes
+         self.classifier = nn.Sequential(
+             nn.AdaptiveAvgPool3d((1,1,1)),
+             nn.Conv3d(in_channels=576,out_channels=1024,kernel_size=1,stride=1,padding=0),
+             nn.Hardswish(),
+             nn.Conv3d(in_channels=1024,out_channels=self.num_classes,kernel_size=1,stride=1,padding=0), #2 classes for ball/strike
+         )
+
+     def forward(self,x):
+         x = self.block1(x)
+         x = self.block2(x)
+         x = self.block3(x)
+         x = self.block4(x)
+         x = self.classifier(x)
+         x = x.view(x.shape[0], self.num_classes)
+         return x
+
+     def initialize_weights(self):
+         for module in self.modules():
+             if isinstance(module, nn.Conv3d) or isinstance(module, nn.Linear):
+                 if hasattr(module, "nonlinearity"):
+                     if module.nonlinearity in ('relu', 'leaky_relu'):
+                         init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
+                     elif module.nonlinearity == 'hardswish':
+                         init.xavier_uniform_(module.weight)
+             elif isinstance(module, nn.BatchNorm3d):
+                 init.constant_(module.weight, 1)
+                 init.constant_(module.bias, 0)
+
+
+ #MobileNetV3-Large 2D + LSTM for handling the temporal dimension
+ class MobileNetLarge2D(nn.Module):
+     def __init__(self, num_classes=2):
+         super().__init__()
+
+         self.num_classes = num_classes
+
+         #conv2d (h-swish): 224x224x3 -> 112x112x16
+         self.block1 = nn.Sequential(
+             nn.Conv2d(in_channels=3,out_channels=16,stride=2,kernel_size=3,padding=1),
+             nn.BatchNorm2d(16),
+             nn.Hardswish()
+         )
+         #3x3 bottlenecks1 (3, ReLU): 112x112x16 -> 56x56x24
+         self.block2 = nn.Sequential(
+             Bottleneck2D(in_channels=16,out_channels=16,expanded_channels=16,stride=1,nonlinearity=nn.ReLU(),dropout=0.2),
+             Bottleneck2D(in_channels=16,out_channels=24,expanded_channels=64,stride=2,nonlinearity=nn.ReLU()),
+             Bottleneck2D(in_channels=24,out_channels=24,expanded_channels=72,stride=1,nonlinearity=nn.ReLU(),dropout=0.2)
+         )
+         #5x5 bottlenecks1 (3, ReLU, squeeze-excite): 56x56x24 -> 28x28x40
+         self.block3 = nn.Sequential(
+             Bottleneck2D(in_channels=24,out_channels=40,expanded_channels=72,stride=2,use_se=True,kernel_size=5,nonlinearity=nn.ReLU(),dropout=0.2),
+             Bottleneck2D(in_channels=40,out_channels=40,expanded_channels=120,stride=1,use_se=True,kernel_size=5,nonlinearity=nn.ReLU()),
+             Bottleneck2D(in_channels=40,out_channels=40,expanded_channels=120,stride=1,use_se=True,kernel_size=5,nonlinearity=nn.ReLU(),dropout=0.2)
+         )
+         #3x3 bottlenecks2 (6, h-swish, last two get squeeze-excite): 28x28x40 -> 14x14x112
+         self.block4 = nn.Sequential(
+             Bottleneck2D(in_channels=40,out_channels=80,expanded_channels=240,stride=2,dropout=0.2),
+             Bottleneck2D(in_channels=80,out_channels=80,expanded_channels=240,stride=1),
+             Bottleneck2D(in_channels=80,out_channels=80,expanded_channels=184,stride=1,dropout=0.2),
+             Bottleneck2D(in_channels=80,out_channels=80,expanded_channels=184,stride=1),
+             Bottleneck2D(in_channels=80,out_channels=112,expanded_channels=480,stride=1,use_se=True,dropout=0.2),
+             Bottleneck2D(in_channels=112,out_channels=112,expanded_channels=672,stride=1,use_se=True,dropout=0.2)
+         )
+         #5x5 bottlenecks2 (3, h-swish, squeeze-excite): 14x14x112 -> 7x7x160
+         self.block5 = nn.Sequential(
+             Bottleneck2D(in_channels=112,out_channels=160,expanded_channels=672,stride=2,use_se=True,kernel_size=5),
+             Bottleneck2D(in_channels=160,out_channels=160,expanded_channels=960,stride=1,use_se=True,kernel_size=5),
+             Bottleneck2D(in_channels=160,out_channels=160,expanded_channels=960,stride=1,use_se=True,kernel_size=5)
+         )
+         #conv2d (h-swish), avg pool 7x7: 7x7x160 -> 1x1x960
+         self.block6 = nn.Sequential(
+             nn.Conv2d(in_channels=160,out_channels=960,stride=1,kernel_size=1),
+             nn.BatchNorm2d(960),
+             nn.Hardswish(),
+             nn.AvgPool2d(kernel_size=7,stride=1)
+         )
+         #LSTM over the per-frame features: 960 -> 32
+         self.lstm = nn.LSTM(input_size=960,hidden_size=32,num_layers=5,batch_first=True)
+         #classifier: linear layer on the last LSTM output
+         self.classifier = nn.Sequential(
+             nn.Linear(32,self.num_classes) #2 classes for ball/strike
+         )
+
+     def forward(self,x):
+         #x is shape (batch_size, timesteps, C, H, W)
+         batch_size,timesteps,C,H,W = x.size()
+         cnn_out = torch.zeros(batch_size,timesteps,960).to(x.device) #block6 outputs 960 features per frame
+         #we're looping through the frames in the video
+         for i in range(timesteps):
+             #select the frame at the ith position
+             frame = x[:, i, :, :, :]
+             frame = self.block1(frame)
+             frame = self.block2(frame)
+             frame = self.block3(frame)
+             frame = self.block4(frame)
+             frame = self.block5(frame)
+             frame = self.block6(frame)
+             #flatten the frame (minus the batch dimension)
+             frame = frame.view(frame.size(0), -1)
+             cnn_out[:, i, :] = frame
+         #run the per-frame features through the LSTM
+         x = cnn_out
+         x, _ = self.lstm(x)
+         #get the output from the last timestep only
+         x = x[:, -1, :]
+         x = self.classifier(x)
+         return x
+
+     def initialize_weights(self):
+         for module in self.modules():
+             if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
+                 if hasattr(module, "nonlinearity"):
+                     if module.nonlinearity == 'relu':
+                         init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
+                     elif module.nonlinearity == 'hardswish':
+                         init.xavier_uniform_(module.weight)
+             elif isinstance(module, nn.BatchNorm2d):
+                 init.constant_(module.weight, 1)
+                 init.constant_(module.bias, 0)
+
+
+ #MobileNetV3-Small 2D + LSTM for handling the temporal dimension
+ class MobileNetSmall2D(nn.Module):
+     def __init__(self,num_classes=2):
+         super().__init__()
+
+         self.num_classes = num_classes
+
+         #conv2d (h-swish): 224x224x3 -> 112x112x16
+         self.block1 = nn.Sequential(
+             nn.Conv2d(in_channels=3,out_channels=16,kernel_size=3,stride=2,padding=1),
+             nn.BatchNorm2d(16),
+             nn.Hardswish()
+         )
+         #3x3 bottlenecks (3, ReLU, first gets squeeze-excite): 112x112x16 -> 28x28x24
+         self.block2 = nn.Sequential(
+             Bottleneck2D(in_channels=16,out_channels=16,expanded_channels=16,stride=2,use_se=True,nonlinearity=nn.ReLU(),dropout=0.2),
+             Bottleneck2D(in_channels=16,out_channels=24,expanded_channels=72,stride=2,nonlinearity=nn.ReLU(),dropout=0.2),
+             Bottleneck2D(in_channels=24,out_channels=24,expanded_channels=88,stride=1,nonlinearity=nn.ReLU(),dropout=0.2)
+         )
+         #5x5 bottlenecks (8, h-swish, squeeze-excite): 28x28x24 -> 7x7x96
+         self.block3 = nn.Sequential(
+             Bottleneck2D(in_channels=24,out_channels=40,expanded_channels=96,stride=2,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck2D(in_channels=40,out_channels=40,expanded_channels=240,stride=1,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck2D(in_channels=40,out_channels=40,expanded_channels=240,stride=1,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck2D(in_channels=40,out_channels=48,expanded_channels=120,stride=1,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck2D(in_channels=48,out_channels=48,expanded_channels=144,stride=1,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck2D(in_channels=48,out_channels=96,expanded_channels=288,stride=2,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck2D(in_channels=96,out_channels=96,expanded_channels=576,stride=1,use_se=True,kernel_size=5,dropout=0.2),
+             Bottleneck2D(in_channels=96,out_channels=96,expanded_channels=576,stride=1,use_se=True,kernel_size=5,dropout=0.2)
+         )
+         #conv2d (h-swish), avg pool 7x7: 7x7x96 -> 1x1x576
+         self.block4 = nn.Sequential(
+             nn.Conv2d(in_channels=96,out_channels=576,kernel_size=1,stride=1,padding=0),
+             SEBlock2D(channels=576),
+             nn.BatchNorm2d(576),
+             nn.Hardswish(),
+             nn.AvgPool2d(kernel_size=7,stride=1)
+         )
+         #LSTM over the per-frame features: 576 -> 64
+         self.lstm = nn.LSTM(input_size=576,hidden_size=64,num_layers=1,batch_first=True)
+         #classifier: linear layer on the last LSTM output
+         self.classifier = nn.Sequential(
+             nn.Linear(64,self.num_classes) #2 classes for ball/strike
+         )
+
+     def forward(self,x):
+         #x is of shape (batch_size, timesteps, C, H, W)
+         batch_size, timesteps, C, H, W = x.size()
+         cnn_out = torch.zeros(batch_size, timesteps, 576).to(x.device) #block4 outputs 576 features per frame
+         #we're looping through the frames in the video
+         for i in range(timesteps):
+             #select the frame at the ith position
+             frame = x[:, i, :, :, :]
+             frame = self.block1(frame)
+             frame = self.block2(frame)
+             frame = self.block3(frame)
+             frame = self.block4(frame)
+             #flatten the frame (minus the batch dimension)
+             frame = frame.view(frame.size(0), -1)
+             cnn_out[:, i, :] = frame
+         #run the per-frame features through the LSTM
+         x = cnn_out
+         x, _ = self.lstm(x)
+         #get the output from the last timestep only
+         x = x[:, -1, :]
+         x = self.classifier(x)
+         return x
+
+     def initialize_weights(self):
+         for module in self.modules():
+             if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
+                 if hasattr(module, "nonlinearity"):
+                     if module.nonlinearity == 'relu':
+                         init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
+                     elif module.nonlinearity == 'hardswish':
+                         init.xavier_uniform_(module.weight)
+             elif isinstance(module, nn.BatchNorm2d):
+                 init.constant_(module.weight, 1)
+                 init.constant_(module.bias, 0)
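
For reference, a minimal smoke test of the classes added above (this sketch is not part of the commit; the 16-frame clip length and 224x224 input size are assumptions taken from the comments in mobilenet.py). The 3D models expect clips shaped (batch, channels, frames, height, width), while the 2D+LSTM models expect (batch, frames, channels, height, width):

import torch
from mobilenet import MobileNetSmall3D, MobileNetSmall2D

# hypothetical shapes: 2 clips, 16 frames, 224x224 RGB
clip_3d = torch.randn(2, 3, 16, 224, 224)   # (batch, channels, frames, H, W) for the 3D nets
clip_2d = torch.randn(2, 16, 3, 224, 224)   # (batch, frames, channels, H, W) for the 2D+LSTM nets

model_3d = MobileNetSmall3D(num_classes=2)
model_2d = MobileNetSmall2D(num_classes=2)
model_3d.initialize_weights()
model_2d.initialize_weights()

with torch.no_grad():
    print(model_3d(clip_3d).shape)  # torch.Size([2, 2]) -> ball/strike logits
    print(model_2d(clip_2d).shape)  # torch.Size([2, 2])
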
picklebot_2m.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8361eaa6e117e4f26b76daf5dcc98003c2bbb59ecec5bac0ff7743143fb8e16a
+ size 240726237
weights/MobileNetLarge.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a851fd44dd5a96af4bb6a6525f69f727487a6c8908314f1d755a6c34a452bcaa
+ size 8454104
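
Assuming weights/MobileNetLarge.pth holds a plain state_dict for one of the MobileNetLarge variants defined in mobilenet.py (which variant it matches, and whether it is a bare state_dict rather than a larger checkpoint dict, is an assumption based only on the file name), loading it would look roughly like this:

import torch
from mobilenet import MobileNetLarge3D  # assumption: the checkpoint matches the 3D variant

model = MobileNetLarge3D(num_classes=2)
checkpoint = torch.load("weights/MobileNetLarge.pth", map_location="cpu")
model.load_state_dict(checkpoint)  # raises if the file is not a matching state_dict
model.eval()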