drsaikirant88 committed on
Commit
b202543
1 Parent(s): 1cd092f

initial commit with working code (local)

Files changed (9)
  1. .gitignore +3 -0
  2. README.md +1 -0
  3. app.py +354 -0
  4. cfg/openimages.names +601 -0
  5. cfg/yolov3-openimages.cfg +789 -0
  6. darknet.py +322 -0
  7. detect.py +161 -0
  8. requirements.txt +4 -0
  9. utils.py +237 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ .ipynb_checkpoints
2
+ __pycache__
3
+ desktop.ini
README.md CHANGED
@@ -11,3 +11,4 @@ license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
app.py ADDED
@@ -0,0 +1,354 @@
1
+ # Facial Recognition with Emotion / Sentiment Detector
2
+
3
+ # This is a custom, hard-coded version of darknet with
4
+ # YOLOv3 implementation for openimages database. This
5
+ # was written to test viability of implementing YOLO
6
+ # for face detection followed by emotion / sentiment
7
+ # analysis.
8
+ #
9
+ # Configuration, weights and data are hardcoded.
10
+ # This version takes any images, detects faces,
11
+ # and then runs emotion / sentiment analysis
12
+ #
13
+ # Author : Saikiran Tharimena
14
+ # Co-Authors: Kjetil Marinius Sjulsen, Juan Carlos Calvet Lopez
15
+ # Project : Emotion / Sentiment Detection from news images
16
+ # Date : 12 September 2022
17
+ # Version : v0.1
18
+ #
19
+ # (C) Schibsted ASA
20
+
21
+ # Libraries
22
+ import torch
23
+ from utils import *
24
+ import gradio as gr
25
+ from numpy import array
26
+ from darknet import Darknet
27
+ from torch.autograd import Variable
28
+ from torch.cuda import is_available as check_cuda
29
+ from PIL.ImageOps import grayscale
30
+ from fastai.vision.all import PILImage, load_learner
31
+
32
+ ################## DARKNET ##################
33
+ # Parameters
34
+ batch_size = 1
35
+ confidence = 0.25
36
+ nms_thresh = 0.30
37
+ run_cuda = False
38
+
39
+ # CFG Files
40
+ cfg = 'cfg/yolov3-openimages.cfg'
41
+ clsnames= 'cfg/openimages.names'
42
+ weights = 'cfg/yolov3-openimages.weights'
43
+
44
+ # Load classes
45
+ classes = load_classes(clsnames)
46
+ num_classes = len(classes)
47
+
48
+ # Set up the neural network
49
+ print('Load Network')
50
+ model = Darknet(cfg)
51
+
52
+ print('Load Weights')
53
+ model.load_weights(weights)
54
+
55
+ print('Successfully loaded Network')
56
+
57
+ # Check CUDA
58
+ if run_cuda:
59
+ CUDA = check_cuda()
60
+ else:
61
+ CUDA = False
62
+
63
+ # Input dimension
64
+ inp_dim = int(model.net_info["height"])
65
+
66
+ # put the model on GPU
67
+ if CUDA:
68
+ model.cuda()
69
+
70
+ # Set the model in evaluation mode
71
+ model.eval()
72
+
73
+ def get_detections(x):
74
+ c1 = [int(y) for y in x[1:3]]
75
+ c2 = [int(y) for y in x[3:5]]
76
+
77
+ det_class = int(x[-1])
78
+ label = "{0}".format(classes[det_class])
79
+
80
+ return (label, tuple(c1 + c2))
81
+
82
+ # face detector
83
+ def detector(image):
84
+ # Wrap the single input image in lists so the batching code below can be reused as-is
85
+ imlist = [image]
86
+ loaded_ims = [image]
87
+
88
+ im_batches = list(map(prep_image, loaded_ims, [inp_dim for x in range(len(imlist))]))
89
+ im_dim_list = [(x.shape[1], x.shape[0]) for x in loaded_ims]
90
+ im_dim_list = torch.FloatTensor(im_dim_list).repeat(1,2)
91
+
92
+ leftover = 0
93
+ if (len(im_dim_list) % batch_size):
94
+ leftover = 1
95
+
96
+ if batch_size != 1:
97
+ num_batches = len(imlist) // batch_size + leftover
98
+ im_batches = [torch.cat((im_batches[i*batch_size : min((i + 1)*batch_size,
99
+ len(im_batches))])) for i in range(num_batches)]
100
+
101
+ write = 0
102
+ if CUDA:
103
+ im_dim_list = im_dim_list.cuda()
104
+
105
+ for i, batch in enumerate(im_batches):
106
+ # load the image
107
+
108
+ if CUDA:
109
+ batch = batch.cuda()
110
+ with torch.no_grad():
111
+ prediction = model(Variable(batch), CUDA)
112
+
113
+ prediction = write_results(prediction, confidence, num_classes, nms_conf = nms_thresh)
114
+
115
+ if type(prediction) == int:
116
+
117
+ for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]):
118
+ im_id = i*batch_size + im_num
119
+
120
+ continue
121
+
122
+ prediction[:,0] += i*batch_size # transform the attribute from index in batch to index in imlist
123
+
124
+ if not write: # If we haven't initialised output
125
+ output = prediction
126
+ write = 1
127
+ else:
128
+ output = torch.cat((output, prediction))
129
+
130
+ for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]):
131
+ im_id = i * batch_size + im_num
132
+ objs = [classes[int(x[-1])] for x in output if int(x[0]) == im_id]
133
+
134
+ if CUDA:
135
+ torch.cuda.synchronize()
136
+
137
+ try:
138
+ output
139
+ except NameError:
140
+ return None
141
+
142
+ im_dim_list = torch.index_select(im_dim_list, 0, output[:,0].long())
143
+
144
+ scaling_factor = torch.min(608/im_dim_list,1)[0].view(-1,1)
145
+
146
+ output[:, [1,3]] -= (inp_dim - scaling_factor*im_dim_list[:,0].view(-1,1))/2
147
+ output[:, [2,4]] -= (inp_dim - scaling_factor*im_dim_list[:,1].view(-1,1))/2
148
+
149
+ output[:, 1:5] /= scaling_factor
150
+
151
+ for i in range(output.shape[0]):
152
+ output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim_list[i,0])
153
+ output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim_list[i,1])
154
+
155
+ detections = list(map(get_detections, output))
156
+
157
+ if CUDA:
158
+ torch.cuda.empty_cache()
159
+
160
+ return loaded_ims[0], detections
161
+ #############################################
162
+
163
+
164
+ # Emotion
165
+ learn_emotion = load_learner('models/emotions_vgg19.pkl')
166
+ learn_emotion_labels = learn_emotion.dls.vocab
167
+
168
+ # Sentiment
169
+ learn_sentiment = load_learner('models/sentiment_vgg19.pkl')
170
+ learn_sentiment_labels = learn_sentiment.dls.vocab
171
+
172
+ def crop_images(img, bbox):
173
+ "Here image should be an image object from PILImage.create"
174
+
175
+ # Coordinates of face in cv2 format
176
+ xmin, ymin, xmax, ymax = bbox[1]
177
+
178
+ # resize and crop face
179
+ return img.crop((xmin, ymin, xmax, ymax))
180
+
181
+
182
+ def detect_person_face(img, detections):
183
+ '''This function is called from within detect_face.
184
+ If only a person is detected, this will crop the
185
+ image and try to detect a face again.'''
186
+
187
+ faces = []
188
+
189
+ # Loop through people
190
+ for detection in detections:
191
+
192
+ # Get cropped image of person
193
+ temp = crop_images(img, detection)
194
+
195
+ # run detector again
196
+ _, detect = detector(array(temp)[...,:3])
197
+
198
+ # check for human faces
199
+ human_face = [idx for idx, val in enumerate(detect) if val[0] == 'Human face']
200
+
201
+ if len(human_face) == 0:
202
+ continue
203
+
204
+ # Force it to take only 1 face per person
205
+ # crop face and append to list
206
+ faces.append(crop_images(temp, detect[human_face[0]]))
207
+
208
+ return faces
209
+
210
+
211
+ def detect_face(img):
212
+
213
+ _, detections = detector(array(img)[...,:3])
214
+
215
+ # check for human faces
216
+ human_face = [idx for idx, val in enumerate(detections) if val[0] == 'Human face']
217
+
218
+ if len(human_face) == 0:
219
+ human_face = [idx for idx, val in enumerate(detections) if val[0] == 'Person']
220
+
221
+ if len(human_face) == 0:
222
+ return None
223
+ else:
224
+ # Only get human face detections
225
+ faces = detect_person_face(img, [detections[idx] for idx in human_face])
226
+
227
+ else:
228
+ # Only get human face detections
229
+ faces = []
230
+
231
+ for idx in human_face:
232
+ faces.append(crop_images(img, detections[idx]))
233
+
234
+ return faces
235
+
236
+
237
+ # Predict
238
+ def predict(img):
239
+
240
+ img = PILImage.create(img)
241
+
242
+ # Detect faces
243
+ faces = detect_face(img)
244
+
245
+ output = []
246
+
247
+ if not faces: # covers both None and an empty list from detect_face
248
+
249
+ img = img.resize((48, 48))
250
+
251
+ pred_emotion, pred_emotion_idx, probs_emotion = learn_emotion.predict(array(grayscale(img)))
252
+
253
+ pred_sentiment, pred_sentiment_idx, probs_sentiment = learn_sentiment.predict(array(grayscale(img)))
254
+
255
+ emotions = {learn_emotion_labels[i]: float(probs_emotion[i]) for i in range(len(learn_emotion_labels))}
256
+ sentiments = {learn_sentiment_labels[i]: float(probs_sentiment[i]) for i in range(len(learn_sentiment_labels))}
257
+
258
+ output = [img.resize((48, 48)), emotions, sentiments, None, None, None, None, None, None]
259
+
260
+ else: # Max 3 for now
261
+ for face in faces[:3]:
262
+
263
+ img = face.resize((48, 48))
264
+
265
+ pred_emotion, pred_emotion_idx, probs_emotion = learn_emotion.predict(array(grayscale(img)))
266
+
267
+ pred_sentiment, pred_sentiment_idx, probs_sentiment = learn_sentiment.predict(array(grayscale(img)))
268
+
269
+ emotions = {learn_emotion_labels[i]: float(probs_emotion[i]) for i in range(len(learn_emotion_labels))}
270
+ sentiments = {learn_sentiment_labels[i]: float(probs_sentiment[i]) for i in range(len(learn_sentiment_labels))}
271
+
272
+ output.append(img)
273
+ output.append(emotions)
274
+ output.append(sentiments)
275
+
276
+ temp = output[-3:]
277
+ while len(output) < 9:
278
+ output = output + temp
279
+
280
+ return output
281
+
282
+ # Gradio
283
+ title = 'Face Recognition with Emotion and Sentiment Detector'
284
+
285
+ description = gr.Markdown(
286
+ """Ever wondered what a person might be feeling looking at their picture?
287
+ Well, now you can! Try this fun app. Just upload a facial image in JPG or
288
+ PNG format. Voilà! You can now see what they might have felt when the picture
289
+ was taken.
290
+
291
+ This is an updated version of Facial Expression Classifier:
292
+ https://huggingface.co/spaces/schibsted/facial_expression_classifier
293
+ """).value
294
+
295
+ article = gr.Markdown(
296
+ """**DISCLAIMER:** This model does not reveal the actual emotional state of a person. Use and
297
+ interpret results at your own risk! It was built as a demo for an AI course. Sample images
298
+ were downloaded from VG & AftenPosten news webpages. Copyrights belong to the respective
299
+ brands. All rights reserved.
300
+
301
+ **PREMISE:** The idea is to determine an overall sentiment of a news site on a daily basis
302
+ based on the pictures. We are restricting pictures to only include close-up facial
303
+ images.
304
+
305
+ **DATA:** The FER2013 dataset consists of 48x48 pixel grayscale images of faces. There are 28,709
306
+ images in the training set and 3,589 images in the test set. However, for this demo all
307
+ pictures were combined into a single dataset and an 80:20 split was used for training. Images
308
+ are assigned one of the 7 emotions: Angry, Disgust, Fear, Happy, Sad, Surprise, and Neutral.
309
+ In addition to these 7 classes, images were re-classified into 3 sentiment categories based
310
+ on emotions:
311
+
312
+ Positive (Happy, Surprise)
313
+
314
+ Negative (Angry, Disgust, Fear, Sad)
315
+
316
+ Neutral (Neutral)
317
+
318
+ FER2013 (preliminary version) dataset can be downloaded at:
319
+ https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge/data
320
+
321
+ **EMOTION / SENTIMENT MODEL:** VGG19 was used as the base model and trained on the FER2013 dataset. The model was trained
322
+ using PyTorch and FastAI. Two models were trained, one for detecting emotion and the other
323
+ for detecting sentiment. Although this could have been done with just one model, here two
324
+ models were trained for the demo.
325
+
326
+ **FACE DETECTOR:** Darknet with YOLOv3 architecture was used for face detection. Reach out to me for full details.
327
+ In short, any image is first sent through darknet. If a face is detected, it is passed through the emotion/sentiment
328
+ model for each face in the picture. If a person is detected rather than a face, the image is cropped and run through
329
+ the face detector again. If a face is then detected, it is passed through the emotion/sentiment model. If no face is
330
+ detected in an image, the entire image is evaluated to generate some score. This is done because I couldn't
331
+ figure out how to pipe None/blank output to Gradio.Interface(). There may be an option through Gradio.Blocks(), but I was
332
+ too lazy to go through that at this stage. In addition, the output is restricted to at most 3 faces per picture.
333
+ """).value
334
+
335
+ enable_queue=True
336
+
337
+ examples = ['happy1.jpg', 'happy2.jpg', 'angry1.png', 'angry2.jpg', 'neutral1.jpg', 'neutral2.jpg']
338
+
339
+ gr.Interface(fn = predict,
340
+ inputs = gr.Image(),
341
+ outputs = [gr.Image(shape=(24, 24), label='Person 1'),
342
+ gr.Label(label='Emotion - Person 1'),
343
+ gr.Label(label='Sentiment - Person 1'),
344
+ gr.Image(shape=(24, 24), label='Person 2'),
345
+ gr.Label(label='Emotion - Person 2'),
346
+ gr.Label(label='Sentiment - Person 2'),
347
+ gr.Image(shape=(24, 24), label='Person 3'),
348
+ gr.Label(label='Emotion - Person 3'),
349
+ gr.Label(label='Sentiment - Person 3'),], #gr.Label(),
350
+ title = title,
351
+ examples = examples,
352
+ description = description,
353
+ article=article,
354
+ allow_flagging='never').launch(enable_queue=enable_queue)
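
Note (editor's sketch, not part of the commit): a minimal example of how the predict() function above could be exercised outside of the Gradio UI. It assumes the models/*.pkl files and the YOLO weights referenced in app.py are on disk, and 'sample.jpg' is a hypothetical image path. Since app.py launches the Gradio interface at import time, the launch() call would need to be guarded (or the function pasted into a notebook) for a standalone test.

    from app import predict

    out = predict('sample.jpg')
    # out is a flat list of 9 items: (cropped 48x48 face, emotion probabilities,
    # sentiment probabilities) for up to three detected persons; when fewer than
    # three faces are found, the last triplet is repeated to fill all nine slots.
    face1, emotions1, sentiments1 = out[0], out[1], out[2]
    print(max(emotions1, key=emotions1.get), max(sentiments1, key=sentiments1.get))
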
cfg/openimages.names ADDED
@@ -0,0 +1,601 @@
1
+ Tortoise
2
+ Container
3
+ Magpie
4
+ Sea turtle
5
+ Football
6
+ Ambulance
7
+ Ladder
8
+ Toothbrush
9
+ Syringe
10
+ Sink
11
+ Toy
12
+ Organ
13
+ Cassette deck
14
+ Apple
15
+ Human eye
16
+ Cosmetics
17
+ Paddle
18
+ Snowman
19
+ Beer
20
+ Chopsticks
21
+ Human beard
22
+ Bird
23
+ Parking meter
24
+ Traffic light
25
+ Croissant
26
+ Cucumber
27
+ Radish
28
+ Towel
29
+ Doll
30
+ Skull
31
+ Washing machine
32
+ Glove
33
+ Tick
34
+ Belt
35
+ Sunglasses
36
+ Banjo
37
+ Cart
38
+ Ball
39
+ Backpack
40
+ Bicycle
41
+ Home appliance
42
+ Centipede
43
+ Boat
44
+ Surfboard
45
+ Boot
46
+ Headphones
47
+ Hot dog
48
+ Shorts
49
+ Fast food
50
+ Bus
51
+ Boy
52
+ Screwdriver
53
+ Bicycle wheel
54
+ Barge
55
+ Laptop
56
+ Miniskirt
57
+ Drill
58
+ Dress
59
+ Bear
60
+ Waffle
61
+ Pancake
62
+ Brown bear
63
+ Woodpecker
64
+ Blue jay
65
+ Pretzel
66
+ Bagel
67
+ Tower
68
+ Teapot
69
+ Person
70
+ Bow and arrow
71
+ Swimwear
72
+ Beehive
73
+ Brassiere
74
+ Bee
75
+ Bat
76
+ Starfish
77
+ Popcorn
78
+ Burrito
79
+ Chainsaw
80
+ Balloon
81
+ Wrench
82
+ Tent
83
+ Vehicle registration plate
84
+ Lantern
85
+ Toaster
86
+ Flashlight
87
+ Billboard
88
+ Tiara
89
+ Limousine
90
+ Necklace
91
+ Carnivore
92
+ Scissors
93
+ Stairs
94
+ Computer keyboard
95
+ Printer
96
+ Traffic sign
97
+ Chair
98
+ Shirt
99
+ Poster
100
+ Cheese
101
+ Sock
102
+ Fire hydrant
103
+ Land vehicle
104
+ Earrings
105
+ Tie
106
+ Watercraft
107
+ Cabinetry
108
+ Suitcase
109
+ Muffin
110
+ Bidet
111
+ Snack
112
+ Snowmobile
113
+ Clock
114
+ Medical equipment
115
+ Cattle
116
+ Cello
117
+ Jet ski
118
+ Camel
119
+ Coat
120
+ Suit
121
+ Desk
122
+ Cat
123
+ Bronze sculpture
124
+ Juice
125
+ Gondola
126
+ Beetle
127
+ Cannon
128
+ Computer mouse
129
+ Cookie
130
+ Office building
131
+ Fountain
132
+ Coin
133
+ Calculator
134
+ Cocktail
135
+ Computer monitor
136
+ Box
137
+ Stapler
138
+ Christmas tree
139
+ Cowboy hat
140
+ Hiking equipment
141
+ Studio couch
142
+ Drum
143
+ Dessert
144
+ Wine rack
145
+ Drink
146
+ Zucchini
147
+ Ladle
148
+ Human mouth
149
+ Dairy
150
+ Dice
151
+ Oven
152
+ Dinosaur
153
+ Ratchet
154
+ Couch
155
+ Cricket ball
156
+ Winter melon
157
+ Spatula
158
+ Whiteboard
159
+ Pencil sharpener
160
+ Door
161
+ Hat
162
+ Shower
163
+ Eraser
164
+ Fedora
165
+ Guacamole
166
+ Dagger
167
+ Scarf
168
+ Dolphin
169
+ Sombrero
170
+ Tin can
171
+ Mug
172
+ Tap
173
+ Harbor seal
174
+ Stretcher
175
+ Can opener
176
+ Goggles
177
+ Human body
178
+ Roller skates
179
+ Coffee cup
180
+ Cutting board
181
+ Blender
182
+ Plumbing fixture
183
+ Stop sign
184
+ Office supplies
185
+ Volleyball
186
+ Vase
187
+ Slow cooker
188
+ Wardrobe
189
+ Coffee
190
+ Whisk
191
+ Paper towel
192
+ Personal care
193
+ Food
194
+ Sun hat
195
+ Tree house
196
+ Flying disc
197
+ Skirt
198
+ Gas stove
199
+ Salt and pepper shakers
200
+ Mechanical fan
201
+ Face powder
202
+ Fax
203
+ Fruit
204
+ French fries
205
+ Nightstand
206
+ Barrel
207
+ Kite
208
+ Tart
209
+ Treadmill
210
+ Fox
211
+ Flag
212
+ Horn
213
+ Window blind
214
+ Human foot
215
+ Golf cart
216
+ Jacket
217
+ Egg
218
+ Street light
219
+ Guitar
220
+ Pillow
221
+ Human leg
222
+ Isopod
223
+ Grape
224
+ Human ear
225
+ Power plugs and sockets
226
+ Panda
227
+ Giraffe
228
+ Woman
229
+ Door handle
230
+ Rhinoceros
231
+ Bathtub
232
+ Goldfish
233
+ Houseplant
234
+ Goat
235
+ Baseball bat
236
+ Baseball glove
237
+ Mixing bowl
238
+ Marine invertebrates
239
+ Kitchen utensil
240
+ Light switch
241
+ House
242
+ Horse
243
+ Stationary bicycle
244
+ Hammer
245
+ Ceiling fan
246
+ Sofa bed
247
+ Adhesive tape
248
+ Harp
249
+ Sandal
250
+ Bicycle helmet
251
+ Saucer
252
+ Harpsichord
253
+ Human hair
254
+ Heater
255
+ Harmonica
256
+ Hamster
257
+ Curtain
258
+ Bed
259
+ Kettle
260
+ Fireplace
261
+ Scale
262
+ Drinking straw
263
+ Insect
264
+ Hair dryer
265
+ Kitchenware
266
+ Indoor rower
267
+ Invertebrate
268
+ Food processor
269
+ Bookcase
270
+ Refrigerator
271
+ Wood-burning stove
272
+ Punching bag
273
+ Common fig
274
+ Cocktail shaker
275
+ Jaguar
276
+ Golf ball
277
+ Fashion accessory
278
+ Alarm clock
279
+ Filing cabinet
280
+ Artichoke
281
+ Table
282
+ Tableware
283
+ Kangaroo
284
+ Koala
285
+ Knife
286
+ Bottle
287
+ Bottle opener
288
+ Lynx
289
+ Lavender
290
+ Lighthouse
291
+ Dumbbell
292
+ Human head
293
+ Bowl
294
+ Humidifier
295
+ Porch
296
+ Lizard
297
+ Billiard table
298
+ Mammal
299
+ Mouse
300
+ Motorcycle
301
+ Musical instrument
302
+ Swim cap
303
+ Frying pan
304
+ Snowplow
305
+ Bathroom cabinet
306
+ Missile
307
+ Bust
308
+ Man
309
+ Waffle iron
310
+ Milk
311
+ Ring binder
312
+ Plate
313
+ Mobile phone
314
+ Baked goods
315
+ Mushroom
316
+ Crutch
317
+ Pitcher
318
+ Mirror
319
+ Lifejacket
320
+ Table tennis racket
321
+ Pencil case
322
+ Musical keyboard
323
+ Scoreboard
324
+ Briefcase
325
+ Kitchen knife
326
+ Nail
327
+ Tennis ball
328
+ Plastic bag
329
+ Oboe
330
+ Chest of drawers
331
+ Ostrich
332
+ Piano
333
+ Girl
334
+ Plant
335
+ Potato
336
+ Hair spray
337
+ Sports equipment
338
+ Pasta
339
+ Penguin
340
+ Pumpkin
341
+ Pear
342
+ Infant bed
343
+ Polar bear
344
+ Mixer
345
+ Cupboard
346
+ Jacuzzi
347
+ Pizza
348
+ Digital clock
349
+ Pig
350
+ Reptile
351
+ Rifle
352
+ Lipstick
353
+ Skateboard
354
+ Raven
355
+ High heels
356
+ Red panda
357
+ Rose
358
+ Rabbit
359
+ Sculpture
360
+ Saxophone
361
+ Shotgun
362
+ Seafood
363
+ Submarine sandwich
364
+ Snowboard
365
+ Sword
366
+ Picture frame
367
+ Sushi
368
+ Loveseat
369
+ Ski
370
+ Squirrel
371
+ Tripod
372
+ Stethoscope
373
+ Submarine
374
+ Scorpion
375
+ Segway
376
+ Training bench
377
+ Snake
378
+ Coffee table
379
+ Skyscraper
380
+ Sheep
381
+ Television
382
+ Trombone
383
+ Tea
384
+ Tank
385
+ Taco
386
+ Telephone
387
+ Torch
388
+ Tiger
389
+ Strawberry
390
+ Trumpet
391
+ Tree
392
+ Tomato
393
+ Train
394
+ Tool
395
+ Picnic basket
396
+ Cooking spray
397
+ Trousers
398
+ Bowling equipment
399
+ Football helmet
400
+ Truck
401
+ Measuring cup
402
+ Coffeemaker
403
+ Violin
404
+ Vehicle
405
+ Handbag
406
+ Paper cutter
407
+ Wine
408
+ Weapon
409
+ Wheel
410
+ Worm
411
+ Wok
412
+ Whale
413
+ Zebra
414
+ Auto part
415
+ Jug
416
+ Pizza cutter
417
+ Cream
418
+ Monkey
419
+ Lion
420
+ Bread
421
+ Platter
422
+ Chicken
423
+ Eagle
424
+ Helicopter
425
+ Owl
426
+ Duck
427
+ Turtle
428
+ Hippopotamus
429
+ Crocodile
430
+ Toilet
431
+ Toilet paper
432
+ Squid
433
+ Clothing
434
+ Footwear
435
+ Lemon
436
+ Spider
437
+ Deer
438
+ Frog
439
+ Banana
440
+ Rocket
441
+ Wine glass
442
+ Countertop
443
+ Tablet computer
444
+ Waste container
445
+ Swimming pool
446
+ Dog
447
+ Book
448
+ Elephant
449
+ Shark
450
+ Candle
451
+ Leopard
452
+ Axe
453
+ Hand dryer
454
+ Soap dispenser
455
+ Porcupine
456
+ Flower
457
+ Canary
458
+ Cheetah
459
+ Palm tree
460
+ Hamburger
461
+ Maple
462
+ Building
463
+ Fish
464
+ Lobster
465
+ Asparagus
466
+ Furniture
467
+ Hedgehog
468
+ Airplane
469
+ Spoon
470
+ Otter
471
+ Bull
472
+ Oyster
473
+ Horizontal bar
474
+ Convenience store
475
+ Bomb
476
+ Bench
477
+ Ice cream
478
+ Caterpillar
479
+ Butterfly
480
+ Parachute
481
+ Orange
482
+ Antelope
483
+ Beaker
484
+ Moths and butterflies
485
+ Window
486
+ Closet
487
+ Castle
488
+ Jellyfish
489
+ Goose
490
+ Mule
491
+ Swan
492
+ Peach
493
+ Coconut
494
+ Seat belt
495
+ Raccoon
496
+ Chisel
497
+ Fork
498
+ Lamp
499
+ Camera
500
+ Squash
501
+ Racket
502
+ Human face
503
+ Human arm
504
+ Vegetable
505
+ Diaper
506
+ Unicycle
507
+ Falcon
508
+ Chime
509
+ Snail
510
+ Shellfish
511
+ Cabbage
512
+ Carrot
513
+ Mango
514
+ Jeans
515
+ Flowerpot
516
+ Pineapple
517
+ Drawer
518
+ Stool
519
+ Envelope
520
+ Cake
521
+ Dragonfly
522
+ Sunflower
523
+ Microwave oven
524
+ Honeycomb
525
+ Marine mammal
526
+ Sea lion
527
+ Ladybug
528
+ Shelf
529
+ Watch
530
+ Candy
531
+ Salad
532
+ Parrot
533
+ Handgun
534
+ Sparrow
535
+ Van
536
+ Grinder
537
+ Spice rack
538
+ Light bulb
539
+ Corded phone
540
+ Sports uniform
541
+ Tennis racket
542
+ Wall clock
543
+ Serving tray
544
+ Kitchen & dining room table
545
+ Dog bed
546
+ Cake stand
547
+ Cat furniture
548
+ Bathroom accessory
549
+ Facial tissue holder
550
+ Pressure cooker
551
+ Kitchen appliance
552
+ Tire
553
+ Ruler
554
+ Luggage and bags
555
+ Microphone
556
+ Broccoli
557
+ Umbrella
558
+ Pastry
559
+ Grapefruit
560
+ Band-aid
561
+ Animal
562
+ Bell pepper
563
+ Turkey
564
+ Lily
565
+ Pomegranate
566
+ Doughnut
567
+ Glasses
568
+ Human nose
569
+ Pen
570
+ Ant
571
+ Car
572
+ Aircraft
573
+ Human hand
574
+ Skunk
575
+ Teddy bear
576
+ Watermelon
577
+ Cantaloupe
578
+ Dishwasher
579
+ Flute
580
+ Balance beam
581
+ Sandwich
582
+ Shrimp
583
+ Sewing machine
584
+ Binoculars
585
+ Rays and skates
586
+ Ipod
587
+ Accordion
588
+ Willow
589
+ Crab
590
+ Crown
591
+ Seahorse
592
+ Perfume
593
+ Alpaca
594
+ Taxi
595
+ Canoe
596
+ Remote control
597
+ Wheelchair
598
+ Rugby ball
599
+ Armadillo
600
+ Maracas
601
+ Helmet
cfg/yolov3-openimages.cfg ADDED
@@ -0,0 +1,789 @@
1
+ [net]
2
+ # Testing
3
+ batch=1
4
+ subdivisions=1
5
+ # Training
6
+ batch=64
7
+ subdivisions=16
8
+ width=608
9
+ height=608
10
+ channels=3
11
+ momentum=0.9
12
+ decay=0.0005
13
+ angle=0
14
+ saturation = 1.5
15
+ exposure = 1.5
16
+ hue=.1
17
+
18
+ learning_rate=0.001
19
+ burn_in=5000
20
+ max_batches = 500200
21
+ policy=steps
22
+ steps=400000,450000
23
+ scales=.1,.1
24
+
25
+ [convolutional]
26
+ batch_normalize=1
27
+ filters=32
28
+ size=3
29
+ stride=1
30
+ pad=1
31
+ activation=leaky
32
+
33
+ # Downsample
34
+
35
+ [convolutional]
36
+ batch_normalize=1
37
+ filters=64
38
+ size=3
39
+ stride=2
40
+ pad=1
41
+ activation=leaky
42
+
43
+ [convolutional]
44
+ batch_normalize=1
45
+ filters=32
46
+ size=1
47
+ stride=1
48
+ pad=1
49
+ activation=leaky
50
+
51
+ [convolutional]
52
+ batch_normalize=1
53
+ filters=64
54
+ size=3
55
+ stride=1
56
+ pad=1
57
+ activation=leaky
58
+
59
+ [shortcut]
60
+ from=-3
61
+ activation=linear
62
+
63
+ # Downsample
64
+
65
+ [convolutional]
66
+ batch_normalize=1
67
+ filters=128
68
+ size=3
69
+ stride=2
70
+ pad=1
71
+ activation=leaky
72
+
73
+ [convolutional]
74
+ batch_normalize=1
75
+ filters=64
76
+ size=1
77
+ stride=1
78
+ pad=1
79
+ activation=leaky
80
+
81
+ [convolutional]
82
+ batch_normalize=1
83
+ filters=128
84
+ size=3
85
+ stride=1
86
+ pad=1
87
+ activation=leaky
88
+
89
+ [shortcut]
90
+ from=-3
91
+ activation=linear
92
+
93
+ [convolutional]
94
+ batch_normalize=1
95
+ filters=64
96
+ size=1
97
+ stride=1
98
+ pad=1
99
+ activation=leaky
100
+
101
+ [convolutional]
102
+ batch_normalize=1
103
+ filters=128
104
+ size=3
105
+ stride=1
106
+ pad=1
107
+ activation=leaky
108
+
109
+ [shortcut]
110
+ from=-3
111
+ activation=linear
112
+
113
+ # Downsample
114
+
115
+ [convolutional]
116
+ batch_normalize=1
117
+ filters=256
118
+ size=3
119
+ stride=2
120
+ pad=1
121
+ activation=leaky
122
+
123
+ [convolutional]
124
+ batch_normalize=1
125
+ filters=128
126
+ size=1
127
+ stride=1
128
+ pad=1
129
+ activation=leaky
130
+
131
+ [convolutional]
132
+ batch_normalize=1
133
+ filters=256
134
+ size=3
135
+ stride=1
136
+ pad=1
137
+ activation=leaky
138
+
139
+ [shortcut]
140
+ from=-3
141
+ activation=linear
142
+
143
+ [convolutional]
144
+ batch_normalize=1
145
+ filters=128
146
+ size=1
147
+ stride=1
148
+ pad=1
149
+ activation=leaky
150
+
151
+ [convolutional]
152
+ batch_normalize=1
153
+ filters=256
154
+ size=3
155
+ stride=1
156
+ pad=1
157
+ activation=leaky
158
+
159
+ [shortcut]
160
+ from=-3
161
+ activation=linear
162
+
163
+ [convolutional]
164
+ batch_normalize=1
165
+ filters=128
166
+ size=1
167
+ stride=1
168
+ pad=1
169
+ activation=leaky
170
+
171
+ [convolutional]
172
+ batch_normalize=1
173
+ filters=256
174
+ size=3
175
+ stride=1
176
+ pad=1
177
+ activation=leaky
178
+
179
+ [shortcut]
180
+ from=-3
181
+ activation=linear
182
+
183
+ [convolutional]
184
+ batch_normalize=1
185
+ filters=128
186
+ size=1
187
+ stride=1
188
+ pad=1
189
+ activation=leaky
190
+
191
+ [convolutional]
192
+ batch_normalize=1
193
+ filters=256
194
+ size=3
195
+ stride=1
196
+ pad=1
197
+ activation=leaky
198
+
199
+ [shortcut]
200
+ from=-3
201
+ activation=linear
202
+
203
+
204
+ [convolutional]
205
+ batch_normalize=1
206
+ filters=128
207
+ size=1
208
+ stride=1
209
+ pad=1
210
+ activation=leaky
211
+
212
+ [convolutional]
213
+ batch_normalize=1
214
+ filters=256
215
+ size=3
216
+ stride=1
217
+ pad=1
218
+ activation=leaky
219
+
220
+ [shortcut]
221
+ from=-3
222
+ activation=linear
223
+
224
+ [convolutional]
225
+ batch_normalize=1
226
+ filters=128
227
+ size=1
228
+ stride=1
229
+ pad=1
230
+ activation=leaky
231
+
232
+ [convolutional]
233
+ batch_normalize=1
234
+ filters=256
235
+ size=3
236
+ stride=1
237
+ pad=1
238
+ activation=leaky
239
+
240
+ [shortcut]
241
+ from=-3
242
+ activation=linear
243
+
244
+ [convolutional]
245
+ batch_normalize=1
246
+ filters=128
247
+ size=1
248
+ stride=1
249
+ pad=1
250
+ activation=leaky
251
+
252
+ [convolutional]
253
+ batch_normalize=1
254
+ filters=256
255
+ size=3
256
+ stride=1
257
+ pad=1
258
+ activation=leaky
259
+
260
+ [shortcut]
261
+ from=-3
262
+ activation=linear
263
+
264
+ [convolutional]
265
+ batch_normalize=1
266
+ filters=128
267
+ size=1
268
+ stride=1
269
+ pad=1
270
+ activation=leaky
271
+
272
+ [convolutional]
273
+ batch_normalize=1
274
+ filters=256
275
+ size=3
276
+ stride=1
277
+ pad=1
278
+ activation=leaky
279
+
280
+ [shortcut]
281
+ from=-3
282
+ activation=linear
283
+
284
+ # Downsample
285
+
286
+ [convolutional]
287
+ batch_normalize=1
288
+ filters=512
289
+ size=3
290
+ stride=2
291
+ pad=1
292
+ activation=leaky
293
+
294
+ [convolutional]
295
+ batch_normalize=1
296
+ filters=256
297
+ size=1
298
+ stride=1
299
+ pad=1
300
+ activation=leaky
301
+
302
+ [convolutional]
303
+ batch_normalize=1
304
+ filters=512
305
+ size=3
306
+ stride=1
307
+ pad=1
308
+ activation=leaky
309
+
310
+ [shortcut]
311
+ from=-3
312
+ activation=linear
313
+
314
+
315
+ [convolutional]
316
+ batch_normalize=1
317
+ filters=256
318
+ size=1
319
+ stride=1
320
+ pad=1
321
+ activation=leaky
322
+
323
+ [convolutional]
324
+ batch_normalize=1
325
+ filters=512
326
+ size=3
327
+ stride=1
328
+ pad=1
329
+ activation=leaky
330
+
331
+ [shortcut]
332
+ from=-3
333
+ activation=linear
334
+
335
+
336
+ [convolutional]
337
+ batch_normalize=1
338
+ filters=256
339
+ size=1
340
+ stride=1
341
+ pad=1
342
+ activation=leaky
343
+
344
+ [convolutional]
345
+ batch_normalize=1
346
+ filters=512
347
+ size=3
348
+ stride=1
349
+ pad=1
350
+ activation=leaky
351
+
352
+ [shortcut]
353
+ from=-3
354
+ activation=linear
355
+
356
+
357
+ [convolutional]
358
+ batch_normalize=1
359
+ filters=256
360
+ size=1
361
+ stride=1
362
+ pad=1
363
+ activation=leaky
364
+
365
+ [convolutional]
366
+ batch_normalize=1
367
+ filters=512
368
+ size=3
369
+ stride=1
370
+ pad=1
371
+ activation=leaky
372
+
373
+ [shortcut]
374
+ from=-3
375
+ activation=linear
376
+
377
+ [convolutional]
378
+ batch_normalize=1
379
+ filters=256
380
+ size=1
381
+ stride=1
382
+ pad=1
383
+ activation=leaky
384
+
385
+ [convolutional]
386
+ batch_normalize=1
387
+ filters=512
388
+ size=3
389
+ stride=1
390
+ pad=1
391
+ activation=leaky
392
+
393
+ [shortcut]
394
+ from=-3
395
+ activation=linear
396
+
397
+
398
+ [convolutional]
399
+ batch_normalize=1
400
+ filters=256
401
+ size=1
402
+ stride=1
403
+ pad=1
404
+ activation=leaky
405
+
406
+ [convolutional]
407
+ batch_normalize=1
408
+ filters=512
409
+ size=3
410
+ stride=1
411
+ pad=1
412
+ activation=leaky
413
+
414
+ [shortcut]
415
+ from=-3
416
+ activation=linear
417
+
418
+
419
+ [convolutional]
420
+ batch_normalize=1
421
+ filters=256
422
+ size=1
423
+ stride=1
424
+ pad=1
425
+ activation=leaky
426
+
427
+ [convolutional]
428
+ batch_normalize=1
429
+ filters=512
430
+ size=3
431
+ stride=1
432
+ pad=1
433
+ activation=leaky
434
+
435
+ [shortcut]
436
+ from=-3
437
+ activation=linear
438
+
439
+ [convolutional]
440
+ batch_normalize=1
441
+ filters=256
442
+ size=1
443
+ stride=1
444
+ pad=1
445
+ activation=leaky
446
+
447
+ [convolutional]
448
+ batch_normalize=1
449
+ filters=512
450
+ size=3
451
+ stride=1
452
+ pad=1
453
+ activation=leaky
454
+
455
+ [shortcut]
456
+ from=-3
457
+ activation=linear
458
+
459
+ # Downsample
460
+
461
+ [convolutional]
462
+ batch_normalize=1
463
+ filters=1024
464
+ size=3
465
+ stride=2
466
+ pad=1
467
+ activation=leaky
468
+
469
+ [convolutional]
470
+ batch_normalize=1
471
+ filters=512
472
+ size=1
473
+ stride=1
474
+ pad=1
475
+ activation=leaky
476
+
477
+ [convolutional]
478
+ batch_normalize=1
479
+ filters=1024
480
+ size=3
481
+ stride=1
482
+ pad=1
483
+ activation=leaky
484
+
485
+ [shortcut]
486
+ from=-3
487
+ activation=linear
488
+
489
+ [convolutional]
490
+ batch_normalize=1
491
+ filters=512
492
+ size=1
493
+ stride=1
494
+ pad=1
495
+ activation=leaky
496
+
497
+ [convolutional]
498
+ batch_normalize=1
499
+ filters=1024
500
+ size=3
501
+ stride=1
502
+ pad=1
503
+ activation=leaky
504
+
505
+ [shortcut]
506
+ from=-3
507
+ activation=linear
508
+
509
+ [convolutional]
510
+ batch_normalize=1
511
+ filters=512
512
+ size=1
513
+ stride=1
514
+ pad=1
515
+ activation=leaky
516
+
517
+ [convolutional]
518
+ batch_normalize=1
519
+ filters=1024
520
+ size=3
521
+ stride=1
522
+ pad=1
523
+ activation=leaky
524
+
525
+ [shortcut]
526
+ from=-3
527
+ activation=linear
528
+
529
+ [convolutional]
530
+ batch_normalize=1
531
+ filters=512
532
+ size=1
533
+ stride=1
534
+ pad=1
535
+ activation=leaky
536
+
537
+ [convolutional]
538
+ batch_normalize=1
539
+ filters=1024
540
+ size=3
541
+ stride=1
542
+ pad=1
543
+ activation=leaky
544
+
545
+ [shortcut]
546
+ from=-3
547
+ activation=linear
548
+
549
+ ######################
550
+
551
+ [convolutional]
552
+ batch_normalize=1
553
+ filters=512
554
+ size=1
555
+ stride=1
556
+ pad=1
557
+ activation=leaky
558
+
559
+ [convolutional]
560
+ batch_normalize=1
561
+ size=3
562
+ stride=1
563
+ pad=1
564
+ filters=1024
565
+ activation=leaky
566
+
567
+ [convolutional]
568
+ batch_normalize=1
569
+ filters=512
570
+ size=1
571
+ stride=1
572
+ pad=1
573
+ activation=leaky
574
+
575
+ [convolutional]
576
+ batch_normalize=1
577
+ size=3
578
+ stride=1
579
+ pad=1
580
+ filters=1024
581
+ activation=leaky
582
+
583
+ [convolutional]
584
+ batch_normalize=1
585
+ filters=512
586
+ size=1
587
+ stride=1
588
+ pad=1
589
+ activation=leaky
590
+
591
+ [convolutional]
592
+ batch_normalize=1
593
+ size=3
594
+ stride=1
595
+ pad=1
596
+ filters=1024
597
+ activation=leaky
598
+
599
+ [convolutional]
600
+ size=1
601
+ stride=1
602
+ pad=1
603
+ filters=1818
604
+ activation=linear
605
+
606
+
607
+ [yolo]
608
+ mask = 6,7,8
609
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
610
+ classes=601
611
+ num=9
612
+ jitter=.3
613
+ ignore_thresh = .7
614
+ truth_thresh = 1
615
+ random=1
616
+
617
+
618
+ [route]
619
+ layers = -4
620
+
621
+ [convolutional]
622
+ batch_normalize=1
623
+ filters=256
624
+ size=1
625
+ stride=1
626
+ pad=1
627
+ activation=leaky
628
+
629
+ [upsample]
630
+ stride=2
631
+
632
+ [route]
633
+ layers = -1, 61
634
+
635
+
636
+
637
+ [convolutional]
638
+ batch_normalize=1
639
+ filters=256
640
+ size=1
641
+ stride=1
642
+ pad=1
643
+ activation=leaky
644
+
645
+ [convolutional]
646
+ batch_normalize=1
647
+ size=3
648
+ stride=1
649
+ pad=1
650
+ filters=512
651
+ activation=leaky
652
+
653
+ [convolutional]
654
+ batch_normalize=1
655
+ filters=256
656
+ size=1
657
+ stride=1
658
+ pad=1
659
+ activation=leaky
660
+
661
+ [convolutional]
662
+ batch_normalize=1
663
+ size=3
664
+ stride=1
665
+ pad=1
666
+ filters=512
667
+ activation=leaky
668
+
669
+ [convolutional]
670
+ batch_normalize=1
671
+ filters=256
672
+ size=1
673
+ stride=1
674
+ pad=1
675
+ activation=leaky
676
+
677
+ [convolutional]
678
+ batch_normalize=1
679
+ size=3
680
+ stride=1
681
+ pad=1
682
+ filters=512
683
+ activation=leaky
684
+
685
+ [convolutional]
686
+ size=1
687
+ stride=1
688
+ pad=1
689
+ filters=1818
690
+ activation=linear
691
+
692
+
693
+ [yolo]
694
+ mask = 3,4,5
695
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
696
+ classes=601
697
+ num=9
698
+ jitter=.3
699
+ ignore_thresh = .7
700
+ truth_thresh = 1
701
+ random=1
702
+
703
+
704
+
705
+ [route]
706
+ layers = -4
707
+
708
+ [convolutional]
709
+ batch_normalize=1
710
+ filters=128
711
+ size=1
712
+ stride=1
713
+ pad=1
714
+ activation=leaky
715
+
716
+ [upsample]
717
+ stride=2
718
+
719
+ [route]
720
+ layers = -1, 36
721
+
722
+
723
+
724
+ [convolutional]
725
+ batch_normalize=1
726
+ filters=128
727
+ size=1
728
+ stride=1
729
+ pad=1
730
+ activation=leaky
731
+
732
+ [convolutional]
733
+ batch_normalize=1
734
+ size=3
735
+ stride=1
736
+ pad=1
737
+ filters=256
738
+ activation=leaky
739
+
740
+ [convolutional]
741
+ batch_normalize=1
742
+ filters=128
743
+ size=1
744
+ stride=1
745
+ pad=1
746
+ activation=leaky
747
+
748
+ [convolutional]
749
+ batch_normalize=1
750
+ size=3
751
+ stride=1
752
+ pad=1
753
+ filters=256
754
+ activation=leaky
755
+
756
+ [convolutional]
757
+ batch_normalize=1
758
+ filters=128
759
+ size=1
760
+ stride=1
761
+ pad=1
762
+ activation=leaky
763
+
764
+ [convolutional]
765
+ batch_normalize=1
766
+ size=3
767
+ stride=1
768
+ pad=1
769
+ filters=256
770
+ activation=leaky
771
+
772
+ [convolutional]
773
+ size=1
774
+ stride=1
775
+ pad=1
776
+ filters=1818
777
+ activation=linear
778
+
779
+
780
+ [yolo]
781
+ mask = 0,1,2
782
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783
+ classes=601
784
+ num=9
785
+ jitter=.3
786
+ ignore_thresh = .7
787
+ truth_thresh = 1
788
+ random=1
789
+
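
A note on the filters=1818 value that appears before each [yolo] block above: in Darknet, the final 1x1 convolution of a detection head must output num_anchors_per_scale * (5 + num_classes) channels. A quick check of that arithmetic for this Open Images configuration (a sketch, not part of the commit):

    anchors_per_scale = 3        # each [yolo] layer uses 3 of the 9 anchors (see 'mask')
    num_classes = 601            # classes=601 in the [yolo] blocks
    filters = anchors_per_scale * (5 + num_classes)   # 4 box coords + 1 objectness + 601 class scores
    print(filters)               # 1818, matching the cfg above
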
darknet.py ADDED
@@ -0,0 +1,322 @@
1
+ # PyTorch implementation of Darknet
2
+ # This is a custom, hard-coded version of darknet with
3
+ # YOLOv3 implementation for openimages database. This
4
+ # was written to test viability of implementing YOLO
5
+ # for face detection followed by emotion / sentiment
6
+ # analysis.
7
+ #
8
+ # Configuration, weights and data are hardcoded.
9
+ # Additional options include the ability to create a
10
+ # subset of data with faces extracted for labelling.
11
+ #
12
+ # Author : Saikiran Tharimena
13
+ # Co-Authors: Kjetil Marinius Sjulsen, Juan Carlos Calvet Lopez
14
+ # Project : Emotion / Sentiment Detection from news images
15
+ # Date : 12 September 2022
16
+ # Version : v0.1
17
+ #
18
+ # (C) Schibsted ASA
19
+
20
+ # Libraries
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+ from torch.autograd import Variable
25
+ import numpy as np
26
+ from utils import *
27
+
28
+
29
+ def parse_cfg(cfgfile):
30
+ """
31
+ Takes a configuration file
32
+
33
+ Returns a list of blocks. Each blocks describes a block in the neural
34
+ network to be built. Block is represented as a dictionary in the list
35
+
36
+ """
37
+
38
+ file = open(cfgfile, 'r')
39
+ lines = file.read().split('\n') # store the lines in a list
40
+ lines = [x for x in lines if len(x) > 0] # get rid of the empty lines
41
+ lines = [x for x in lines if x[0] != '#'] # get rid of comments
42
+ lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces
43
+
44
+ block = {}
45
+ blocks = []
46
+
47
+ for line in lines:
48
+ if line[0] == "[": # This marks the start of a new block
49
+ if len(block) != 0: # If block is not empty, it is still holding the values of the previous block.
50
+ blocks.append(block) # add it to the blocks list
51
+ block = {} # re-init the block
52
+ block["type"] = line[1:-1].rstrip()
53
+ else:
54
+ key,value = line.split("=")
55
+ block[key.rstrip()] = value.lstrip()
56
+ blocks.append(block)
57
+
58
+ return blocks
59
+
60
+
61
+ class EmptyLayer(nn.Module):
62
+ def __init__(self):
63
+ super(EmptyLayer, self).__init__()
64
+
65
+
66
+ class DetectionLayer(nn.Module):
67
+ def __init__(self, anchors):
68
+ super(DetectionLayer, self).__init__()
69
+ self.anchors = anchors
70
+
71
+
72
+ def create_modules(blocks):
73
+ net_info = blocks[0] #Captures the information about the input and pre-processing
74
+ module_list = nn.ModuleList()
75
+ prev_filters = 3
76
+ output_filters = []
77
+
78
+ for index, x in enumerate(blocks[1:]):
79
+ module = nn.Sequential()
80
+
81
+ #check the type of block
82
+ #create a new module for the block
83
+ #append to module_list
84
+
85
+ #If it's a convolutional layer
86
+ if (x["type"] == "convolutional"):
87
+ #Get the info about the layer
88
+ activation = x["activation"]
89
+ try:
90
+ batch_normalize = int(x["batch_normalize"])
91
+ bias = False
92
+ except:
93
+ batch_normalize = 0
94
+ bias = True
95
+
96
+ filters= int(x["filters"])
97
+ padding = int(x["pad"])
98
+ kernel_size = int(x["size"])
99
+ stride = int(x["stride"])
100
+
101
+ if padding:
102
+ pad = (kernel_size - 1) // 2
103
+ else:
104
+ pad = 0
105
+
106
+ #Add the convolutional layer
107
+ conv = nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias = bias)
108
+ module.add_module("conv_{0}".format(index), conv)
109
+
110
+ #Add the Batch Norm Layer
111
+ if batch_normalize:
112
+ bn = nn.BatchNorm2d(filters)
113
+ module.add_module("batch_norm_{0}".format(index), bn)
114
+
115
+ #Check the activation.
116
+ #It is either Linear or a Leaky ReLU for YOLO
117
+ if activation == "leaky":
118
+ activn = nn.LeakyReLU(0.1, inplace = True)
119
+ module.add_module("leaky_{0}".format(index), activn)
120
+
121
+ #If it's an upsampling layer
122
+ #We use nearest-neighbour upsampling
123
+ elif (x["type"] == "upsample"):
124
+ stride = int(x["stride"])
125
+ upsample = nn.Upsample(scale_factor = 2, mode = "nearest")
126
+ module.add_module("upsample_{}".format(index), upsample)
127
+
128
+ #If it is a route layer
129
+ elif (x["type"] == "route"):
130
+ x["layers"] = x["layers"].split(',')
131
+ #Start of a route
132
+ start = int(x["layers"][0])
133
+ #end, if there exists one.
134
+ try:
135
+ end = int(x["layers"][1])
136
+ except:
137
+ end = 0
138
+ #Positive annotation
139
+ if start > 0:
140
+ start = start - index
141
+ if end > 0:
142
+ end = end - index
143
+ route = EmptyLayer()
144
+ module.add_module("route_{0}".format(index), route)
145
+ if end < 0:
146
+ filters = output_filters[index + start] + output_filters[index + end]
147
+ else:
148
+ filters= output_filters[index + start]
149
+
150
+ #shortcut corresponds to skip connection
151
+ elif x["type"] == "shortcut":
152
+ shortcut = EmptyLayer()
153
+ module.add_module("shortcut_{}".format(index), shortcut)
154
+
155
+ #Yolo is the detection layer
156
+ elif x["type"] == "yolo":
157
+ mask = x["mask"].split(",")
158
+ mask = [int(x) for x in mask]
159
+
160
+ anchors = x["anchors"].split(",")
161
+ anchors = [int(a) for a in anchors]
162
+ anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors),2)]
163
+ anchors = [anchors[i] for i in mask]
164
+
165
+ detection = DetectionLayer(anchors)
166
+ module.add_module("Detection_{}".format(index), detection)
167
+
168
+ module_list.append(module)
169
+ prev_filters = filters
170
+ output_filters.append(filters)
171
+
172
+ return (net_info, module_list)
173
+
174
+ class Darknet(nn.Module):
175
+ def __init__(self, cfgfile):
176
+ super(Darknet, self).__init__()
177
+ self.blocks = parse_cfg(cfgfile)
178
+ self.net_info, self.module_list = create_modules(self.blocks)
179
+
180
+ def forward(self, x, CUDA):
181
+ modules = self.blocks[1:]
182
+ outputs = {} #We cache the outputs for the route layer
183
+
184
+ write = 0
185
+ for i, module in enumerate(modules):
186
+ module_type = (module["type"])
187
+
188
+ if module_type == "convolutional" or module_type == "upsample":
189
+ x = self.module_list[i](x)
190
+
191
+ elif module_type == "route":
192
+ layers = module["layers"]
193
+ layers = [int(a) for a in layers]
194
+
195
+ if (layers[0]) > 0:
196
+ layers[0] = layers[0] - i
197
+
198
+ if len(layers) == 1:
199
+ x = outputs[i + (layers[0])]
200
+
201
+ else:
202
+ if (layers[1]) > 0:
203
+ layers[1] = layers[1] - i
204
+
205
+ map1 = outputs[i + layers[0]]
206
+ map2 = outputs[i + layers[1]]
207
+ x = torch.cat((map1, map2), 1)
208
+
209
+
210
+ elif module_type == "shortcut":
211
+ from_ = int(module["from"])
212
+ x = outputs[i-1] + outputs[i+from_]
213
+
214
+ elif module_type == 'yolo':
215
+ anchors = self.module_list[i][0].anchors
216
+ #Get the input dimensions
217
+ inp_dim = int (self.net_info["height"])
218
+
219
+ #Get the number of classes
220
+ num_classes = int (module["classes"])
221
+
222
+ #Transform
223
+ x = x.data
224
+ x = predict_transform(x, inp_dim, anchors, num_classes, CUDA)
225
+ if not write: #if no collector has been initialised.
226
+ detections = x
227
+ write = 1
228
+
229
+ else:
230
+ detections = torch.cat((detections, x), 1)
231
+
232
+ outputs[i] = x
233
+
234
+ return detections
235
+
236
+
237
+ def load_weights(self, weightfile):
238
+ #Open the weights file
239
+ fp = open(weightfile, "rb")
240
+
241
+ #The first 5 values are header information
242
+ # 1. Major version number
243
+ # 2. Minor Version Number
244
+ # 3. Subversion number
245
+ # 4,5. Images seen by the network (during training)
246
+ header = np.fromfile(fp, dtype = np.int32, count = 5)
247
+ self.header = torch.from_numpy(header)
248
+ self.seen = self.header[3]
249
+
250
+ weights = np.fromfile(fp, dtype = np.float32)
251
+
252
+ ptr = 0
253
+ for i in range(len(self.module_list)):
254
+ module_type = self.blocks[i + 1]["type"]
255
+
256
+ #If module_type is convolutional load weights
257
+ #Otherwise ignore.
258
+
259
+ if module_type == "convolutional":
260
+ model = self.module_list[i]
261
+ try:
262
+ batch_normalize = int(self.blocks[i+1]["batch_normalize"])
263
+ except:
264
+ batch_normalize = 0
265
+
266
+ conv = model[0]
267
+
268
+
269
+ if (batch_normalize):
270
+ bn = model[1]
271
+
272
+ #Get the number of weights of Batch Norm Layer
273
+ num_bn_biases = bn.bias.numel()
274
+
275
+ #Load the weights
276
+ bn_biases = torch.from_numpy(weights[ptr:ptr + num_bn_biases])
277
+ ptr += num_bn_biases
278
+
279
+ bn_weights = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
280
+ ptr += num_bn_biases
281
+
282
+ bn_running_mean = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
283
+ ptr += num_bn_biases
284
+
285
+ bn_running_var = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
286
+ ptr += num_bn_biases
287
+
288
+ #Cast the loaded weights into dims of model weights.
289
+ bn_biases = bn_biases.view_as(bn.bias.data)
290
+ bn_weights = bn_weights.view_as(bn.weight.data)
291
+ bn_running_mean = bn_running_mean.view_as(bn.running_mean)
292
+ bn_running_var = bn_running_var.view_as(bn.running_var)
293
+
294
+ #Copy the data to model
295
+ bn.bias.data.copy_(bn_biases)
296
+ bn.weight.data.copy_(bn_weights)
297
+ bn.running_mean.copy_(bn_running_mean)
298
+ bn.running_var.copy_(bn_running_var)
299
+
300
+ else:
301
+ #Number of biases
302
+ num_biases = conv.bias.numel()
303
+
304
+ #Load the weights
305
+ conv_biases = torch.from_numpy(weights[ptr: ptr + num_biases])
306
+ ptr = ptr + num_biases
307
+
308
+ #reshape the loaded weights according to the dims of the model weights
309
+ conv_biases = conv_biases.view_as(conv.bias.data)
310
+
311
+ #Finally copy the data
312
+ conv.bias.data.copy_(conv_biases)
313
+
314
+ #Let us load the weights for the Convolutional layers
315
+ num_weights = conv.weight.numel()
316
+
317
+ #Do the same as above for weights
318
+ conv_weights = torch.from_numpy(weights[ptr:ptr+num_weights])
319
+ ptr = ptr + num_weights
320
+
321
+ conv_weights = conv_weights.view_as(conv.weight.data)
322
+ conv.weight.data.copy_(conv_weights)
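
As a rough usage sketch (not part of the commit): parse_cfg() turns the cfg file into a list of dictionaries, and Darknet builds the matching module list and runs the forward pass. The snippet assumes the cfg file from this commit is on disk and that yolov3-openimages.weights (not included in the commit) has been downloaded separately.

    import torch
    from darknet import Darknet, parse_cfg

    blocks = parse_cfg('cfg/yolov3-openimages.cfg')
    print(blocks[0])    # the [net] block as a dict of raw strings, e.g. {'type': 'net', 'width': '608', ...}

    net = Darknet('cfg/yolov3-openimages.cfg')
    net.load_weights('cfg/yolov3-openimages.weights')
    net.eval()

    dummy = torch.zeros(1, 3, 608, 608)    # one blank 608x608 RGB image
    with torch.no_grad():
        preds = net(dummy, False)          # CUDA flag set to False
    print(preds.shape)                     # (1, num_boxes, 5 + 601)
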
detect.py ADDED
@@ -0,0 +1,161 @@
1
+ # PyTorch implementation of Darknet
2
+ # This is a custom, hard-coded version of darknet with
3
+ # YOLOv3 implementation for openimages database. This
4
+ # was written to test viability of implementing YOLO
5
+ # for face detection followed by emotion / sentiment
6
+ # analysis.
7
+ #
8
+ # Configuration, weights and data are hardcoded.
9
+ # Additional options include the ability to create a
10
+ # subset of data with faces extracted for labelling.
11
+ #
12
+ # Author : Saikiran Tharimena
13
+ # Co-Authors: Kjetil Marinius Sjulsen, Juan Carlos Calvet Lopez
14
+ # Project : Emotion / Sentiment Detection from news images
15
+ # Date : 12 September 2022
16
+ # Version : v0.1
17
+ #
18
+ # (C) Schibsted ASA
19
+
20
+ # Libraries
21
+ import os
22
+ import cv2
23
+ import torch
24
+ import numpy as np
25
+ from utils import *
26
+ from darknet import Darknet
27
+ from torch.autograd import Variable
28
+ from torch.cuda import is_available as check_cuda
29
+
30
+ # Parameters
31
+ batch_size = 1
32
+ confidence = 0.25
33
+ nms_thresh = 0.30
34
+ run_cuda = False
35
+
36
+ # CFG Files
37
+ cwd = os.path.dirname(__file__)
38
+ cfg = cwd + '/cfg/yolov3-openimages.cfg'
39
+ data = cwd + '/cfg/openimages.data'
40
+ clsnames= cwd + '/cfg/openimages.names'
41
+ weights = cwd + '/cfg/yolov3-openimages.weights'
42
+
43
+ # Load classes
44
+ num_classes = 601
45
+ classes = load_classes(clsnames)
46
+
47
+ # Set up the neural network
48
+ print('Load Network')
49
+ model = Darknet(cfg)
50
+
51
+ print('Load Weights')
52
+ model.load_weights(weights)
53
+
54
+ print('Successfully loaded Network')
55
+
56
+ # Check CUDA
57
+ if run_cuda:
58
+ CUDA = check_cuda()
59
+ else:
60
+ CUDA = False
61
+
62
+ # Input dimension
63
+ inp_dim = int(model.net_info["height"])
64
+
65
+ # put the model on GPU
66
+ if CUDA:
67
+ model.cuda()
68
+
69
+ # Set the model in evaluation mode
70
+ model.eval()
71
+
72
+ # face detector
73
+ def detect_face(image):
74
+ # Wrap the single input path in a list so the batching code below can be reused as-is
75
+ imlist = [image]
76
+
77
+ loaded_ims = [cv2.imread(x) for x in imlist]
78
+
79
+ im_batches = list(map(prep_image, loaded_ims, [inp_dim for x in range(len(imlist))]))
80
+ im_dim_list = [(x.shape[1], x.shape[0]) for x in loaded_ims]
81
+ im_dim_list = torch.FloatTensor(im_dim_list).repeat(1,2)
82
+
83
+ leftover = 0
84
+ if (len(im_dim_list) % batch_size):
85
+ leftover = 1
86
+
87
+ if batch_size != 1:
88
+ num_batches = len(imlist) // batch_size + leftover
89
+ im_batches = [torch.cat((im_batches[i*batch_size : min((i + 1)*batch_size,
90
+ len(im_batches))])) for i in range(num_batches)]
91
+
92
+ write = 0
93
+ if CUDA:
94
+ im_dim_list = im_dim_list.cuda()
95
+
96
+ for i, batch in enumerate(im_batches):
97
+ # load the image
98
+
99
+ if CUDA:
100
+ batch = batch.cuda()
101
+ with torch.no_grad():
102
+ prediction = model(Variable(batch), CUDA)
103
+
104
+ prediction = write_results(prediction, confidence, num_classes, nms_conf = nms_thresh)
105
+
106
+ if type(prediction) == int:
107
+
108
+ for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]):
109
+ im_id = i*batch_size + im_num
110
+
111
+ continue
112
+
113
+ prediction[:,0] += i*batch_size # transform the attribute from index in batch to index in imlist
114
+
115
+ if not write: # If we haven't initialised output
116
+ output = prediction
117
+ write = 1
118
+ else:
119
+ output = torch.cat((output, prediction))
120
+
121
+ for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]):
122
+ im_id = i * batch_size + im_num
123
+ objs = [classes[int(x[-1])] for x in output if int(x[0]) == im_id]
124
+
125
+ if CUDA:
126
+ torch.cuda.synchronize()
127
+
128
+ try:
129
+ output
130
+ except NameError:
131
+ return None
132
+
133
+ im_dim_list = torch.index_select(im_dim_list, 0, output[:,0].long())
134
+
135
+ scaling_factor = torch.min(608/im_dim_list,1)[0].view(-1,1)
136
+
137
+ output[:, [1,3]] -= (inp_dim - scaling_factor*im_dim_list[:,0].view(-1,1))/2
138
+ output[:, [2,4]] -= (inp_dim - scaling_factor*im_dim_list[:,1].view(-1,1))/2
139
+
140
+ output[:, 1:5] /= scaling_factor
141
+
142
+ for i in range(output.shape[0]):
143
+ output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim_list[i,0])
144
+ output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim_list[i,1])
145
+
146
+ def get_detections(x, results):
147
+ c1 = [int(y) for y in x[1:3]]
148
+ c2 = [int(y) for y in x[3:5]]
149
+
150
+ det_class = int(x[-1])
151
+ label = "{0}".format(classes[det_class])
152
+
153
+ return (label, tuple(c1 + c2))
154
+
155
+ detections = list(map(lambda x: get_detections(x, loaded_ims), output))
156
+
157
+ if CUDA:
158
+ torch.cuda.empty_cache()
159
+
160
+ return loaded_ims[0], detections
161
+
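
For reference, a minimal sketch (not part of the commit) of calling the detect_face() helper above from a local script. Importing detect loads the network at module level, so the weights file is assumed to be present; 'sample.jpg' is a hypothetical path.

    from detect import detect_face

    result = detect_face('sample.jpg')
    if result is not None:
        img, detections = result
        # detections is a list of (label, (xmin, ymin, xmax, ymax)) tuples
        faces = [d for d in detections if d[0] == 'Human face']
        print('%d face(s) found' % len(faces))
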
requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ torch
2
+ fastai
3
+ numpy
4
+ opencv-python
utils.py ADDED
@@ -0,0 +1,237 @@
1
+ # PyTorch implementation of Darknet
2
+ # This is a custom, hard-coded version of darknet with
3
+ # YOLOv3 implementation for openimages database. This
4
+ # was written to test viability of implementing YOLO
5
+ # for face detection followed by emotion / sentiment
6
+ # analysis.
7
+ #
8
+ # Configuration, weights and data are hardcoded.
9
+ # Additional options include the ability to create a
10
+ # subset of data with faces extracted for labelling.
11
+ #
12
+ # Author : Saikiran Tharimena
13
+ # Co-Authors: Kjetil Marinius Sjulsen, Juan Carlos Calvet Lopez
14
+ # Project : Emotion / Sentiment Detection from news images
15
+ # Date : 12 September 2022
16
+ # Version : v0.1
17
+ #
18
+ # (C) Schibsted ASA
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ from torch.autograd import Variable
24
+ import numpy as np
25
+ import cv2
26
+
27
+
28
+ def unique(tensor):
29
+ tensor_np = tensor.cpu().numpy()
30
+ unique_np = np.unique(tensor_np)
31
+ unique_tensor = torch.from_numpy(unique_np)
32
+
33
+ tensor_res = tensor.new(unique_tensor.shape)
34
+ tensor_res.copy_(unique_tensor)
35
+ return tensor_res
36
+
37
+
38
+ def bbox_iou(box1, box2):
39
+ """
40
+ Returns the IoU of two bounding boxes
41
+
42
+
43
+ """
44
+ #Get the coordinates of bounding boxes
45
+ b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3]
46
+ b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3]
47
+
48
+ #get the coordinates of the intersection rectangle
49
+ inter_rect_x1 = torch.max(b1_x1, b2_x1)
50
+ inter_rect_y1 = torch.max(b1_y1, b2_y1)
51
+ inter_rect_x2 = torch.min(b1_x2, b2_x2)
52
+ inter_rect_y2 = torch.min(b1_y2, b2_y2)
53
+
54
+ #Intersection area
55
+ inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * torch.clamp(inter_rect_y2 - inter_rect_y1 + 1, min=0)
56
+
57
+ #Union Area
58
+ b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1)
59
+ b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1)
60
+
61
+ iou = inter_area / (b1_area + b2_area - inter_area)
62
+
63
+ return iou
64
+
65
+
66
+ def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA = True):
67
+
68
+ batch_size = prediction.size(0)
69
+ stride = inp_dim // prediction.size(2)
70
+ grid_size = inp_dim // stride
71
+ bbox_attrs = 5 + num_classes
72
+ num_anchors = len(anchors)
73
+
74
+ prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size)
75
+ prediction = prediction.transpose(1,2).contiguous()
76
+ prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs)
77
+ anchors = [(a[0]/stride, a[1]/stride) for a in anchors]
78
+
79
+ #Sigmoid the centre_X, centre_Y, and object confidence
80
+ prediction[:,:,0] = torch.sigmoid(prediction[:,:,0])
81
+ prediction[:,:,1] = torch.sigmoid(prediction[:,:,1])
82
+ prediction[:,:,4] = torch.sigmoid(prediction[:,:,4])
83
+
84
+ #Add the center offsets
85
+ grid = np.arange(grid_size)
86
+ a,b = np.meshgrid(grid, grid)
87
+
88
+ x_offset = torch.FloatTensor(a).view(-1,1)
89
+ y_offset = torch.FloatTensor(b).view(-1,1)
90
+
91
+ if CUDA:
92
+ x_offset = x_offset.cuda()
93
+ y_offset = y_offset.cuda()
94
+
95
+ x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0)
96
+
97
+ prediction[:,:,:2] += x_y_offset
98
+
99
+ #log-space transform of height and width
100
+ anchors = torch.FloatTensor(anchors)
101
+
102
+ if CUDA:
103
+ anchors = anchors.cuda()
104
+
105
+ anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0)
106
+ prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors
107
+
108
+ prediction[:,:,5: 5 + num_classes] = torch.sigmoid((prediction[:,:, 5 : 5 + num_classes]))
109
+
110
+ prediction[:,:,:4] *= stride
111
+
112
+ return prediction
113
+
114
+
115
+ def write_results(prediction, confidence, num_classes, nms_conf = 0.4):
116
+ conf_mask = (prediction[:,:,4] > confidence).float().unsqueeze(2)
117
+ prediction = prediction*conf_mask
118
+
119
+ box_corner = prediction.new(prediction.shape)
120
+ box_corner[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2)
121
+ box_corner[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2)
122
+ box_corner[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2)
123
+ box_corner[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2)
124
+ prediction[:,:,:4] = box_corner[:,:,:4]
125
+
126
+ batch_size = prediction.size(0)
127
+
128
+ write = False
129
+
130
+
131
+
132
+ for ind in range(batch_size):
133
+ image_pred = prediction[ind] #image Tensor
134
+ #confidence thresholding
135
+ #NMS
136
+
137
+ max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1)
138
+ max_conf = max_conf.float().unsqueeze(1)
139
+ max_conf_score = max_conf_score.float().unsqueeze(1)
140
+ seq = (image_pred[:,:5], max_conf, max_conf_score)
141
+ image_pred = torch.cat(seq, 1)
142
+
143
+ non_zero_ind = (torch.nonzero(image_pred[:,4]))
144
+ try:
145
+ image_pred_ = image_pred[non_zero_ind.squeeze(),:].view(-1,7)
146
+ except:
147
+ continue
148
+
149
+ if image_pred_.shape[0] == 0:
150
+ continue
151
+ #
152
+
153
+ #Get the various classes detected in the image
154
+ img_classes = unique(image_pred_[:,-1]) # -1 index holds the class index
155
+
156
+
157
+ for cls in img_classes:
158
+ #perform NMS
159
+
160
+
161
+ #get the detections with one particular class
162
+ cls_mask = image_pred_*(image_pred_[:,-1] == cls).float().unsqueeze(1)
163
+ class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze()
164
+ image_pred_class = image_pred_[class_mask_ind].view(-1,7)
165
+
166
+ #sort the detections such that the entry with the maximum objectness
167
+ #confidence is at the top
168
+ conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1]
169
+ image_pred_class = image_pred_class[conf_sort_index]
170
+ idx = image_pred_class.size(0) #Number of detections
171
+
172
+ for i in range(idx):
173
+ #Get the IOUs of all boxes that come after the one we are looking at
174
+ #in the loop
175
+ try:
176
+ ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:])
177
+ except ValueError:
178
+ break
179
+
180
+ except IndexError:
181
+ break
182
+
183
+ #Zero out all the detections that have IoU > threshold
184
+ iou_mask = (ious < nms_conf).float().unsqueeze(1)
185
+ image_pred_class[i+1:] *= iou_mask
186
+
187
+ #Remove the non-zero entries
188
+ non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze()
189
+ image_pred_class = image_pred_class[non_zero_ind].view(-1,7)
190
+
191
+ batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind) #Repeat the batch_id for as many detections of the class cls in the image
192
+ seq = batch_ind, image_pred_class
193
+
194
+ if not write:
195
+ output = torch.cat(seq,1)
196
+ write = True
197
+ else:
198
+ out = torch.cat(seq,1)
199
+ output = torch.cat((output,out))
200
+
201
+ try:
202
+ return output
203
+ except:
204
+ return 0
205
+
206
+
207
+ def letterbox_image(img, inp_dim):
208
+ '''resize image with unchanged aspect ratio using padding'''
209
+ img_w, img_h = img.shape[1], img.shape[0]
210
+ w, h = inp_dim
211
+ new_w = int(img_w * min(w/img_w, h/img_h))
212
+ new_h = int(img_h * min(w/img_w, h/img_h))
213
+ resized_image = cv2.resize(img, (new_w,new_h), interpolation = cv2.INTER_CUBIC)
214
+
215
+ canvas = np.full((inp_dim[1], inp_dim[0], 3), 128)
216
+
217
+ canvas[(h-new_h)//2:(h-new_h)//2 + new_h,(w-new_w)//2:(w-new_w)//2 + new_w, :] = resized_image
218
+
219
+ return canvas
220
+
221
+
222
+ def prep_image(img, inp_dim):
223
+ """
224
+ Prepare image for inputting to the neural network.
225
+
226
+ Returns a Variable
227
+ """
228
+ img = (letterbox_image(img, (inp_dim, inp_dim)))
229
+ img = img[:,:,::-1].transpose((2,0,1)).copy()
230
+ img = torch.from_numpy(img).float().div(255.0).unsqueeze(0)
231
+ return img
232
+
233
+
234
+ def load_classes(namesfile):
235
+ fp = open(namesfile, "r")
236
+ names = fp.read().split("\n")[:-1]
237
+ return names
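
A short, hedged example (not part of the commit) of the helper that is easiest to sanity-check in isolation, bbox_iou(), plus a reminder of the row layout produced by write_results(); the boxes below are made up for illustration.

    import torch
    from utils import bbox_iou

    # Two overlapping boxes in (x1, y1, x2, y2) corner form
    a = torch.tensor([[0., 0., 10., 10.]])
    b = torch.tensor([[5., 5., 15., 15.]])
    print(bbox_iou(a, b))   # roughly 0.17 with the +1 pixel convention used above

    # write_results(prediction, confidence, num_classes, nms_conf) filters the raw
    # YOLO output and returns rows of
    #   [batch_idx, x1, y1, x2, y2, objectness, class_conf, class_idx],
    # or the int 0 when nothing survives thresholding.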