abhi001vj committed on
Commit
637af2f
•
1 Parent(s): 31e192b

added the fixes for local

.env CHANGED
@@ -1,2 +1,2 @@
- PINECONE_KEY=696a2b15-b4c0-4581-af5d-2d52d0198950
- PINECONE_ENV=us-central1-gcp
+ PINECONE_KEY=
+ PINECONE_ENV=

Licenseplate_model.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3c9a080781aa7ff722968c944a702983af8a452753edd5ba20719d42349ec7bd
- size 71780037

app.py CHANGED
@@ -1,5 +1,3 @@
- import cv2
- import numpy as np
  import argparse
  import base64
  import io
@@ -9,8 +7,7 @@ import sys
  import traceback
  import uuid
  from typing import List, Optional
- from PIL import ImageEnhance
- import traceback
  import cv2
  import numpy as np
  import pandas as pd
@@ -21,31 +18,20 @@ import torch
  import uvicorn
  from dotenv import load_dotenv
  from fastapi import FastAPI, File, Form, HTTPException, UploadFile
- from PIL import Image
  from pydantic import BaseModel
  from sentence_transformers import SentenceTransformer, util
- from transformers import (
-     AutoFeatureExtractor,
-     AutoModel,
-     DonutProcessor,
-     VisionEncoderDecoderModel,
- )

  load_dotenv()
  pinecone.init(api_key=os.getenv("PINECONE_KEY"), environment=os.getenv("PINECONE_ENV"))
- DETECTION_URL = "/object-detection/"
- CLASSIFICATION_URL = "/object-classification/"
- QUALITY_ASSESSMENT_URL = "/quality-assessment/"
- FACE_URL = "/face-anonymization/"
- LICENCE_URL = "/licenceplate-anonymization/"
- DOCUMENT_QA = "/document-qa/"
  IMAGE_SIMILARITY_DEMO = "/find-similar-image/"
  IMAGE_SIMILARITY_PINECONE_DEMO = "/find-similar-image-pinecone/"
  INDEX_NAME = "imagesearch-demo"
  INDEX_DIMENSION = 512
  TMP_DIR = "tmp"

-

  def enhance_image(pil_image):
      # Convert PIL Image to OpenCV format
@@ -99,353 +85,22 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
99
 
100
  os.makedirs(TMP_DIR, exist_ok=True)
101
 
102
- licence_model = torch.hub.load(
103
- "ultralytics/yolov5", "custom", path="Licenseplate_model.pt", device="cpu", force_reload=True
104
- )
105
- licence_model.cpu()
106
-
107
- detector = cv2.dnn.DetectionModel(
108
- "res10_300x300_ssd_iter_140000_fp16.caffemodel", "deploy.prototxt"
109
- )
110
-
111
- processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
112
- doc_qa_model = VisionEncoderDecoderModel.from_pretrained(
113
- "naver-clova-ix/donut-base-finetuned-docvqa"
114
- )
115
 
116
  device = "cuda" if torch.cuda.is_available() else "cpu"
117
- doc_qa_model.to(device)
118
 
119
 
120
  os.makedirs(TMP_DIR, exist_ok=True)
121
 
122
- model = torch.hub.load(
123
- "ultralytics/yolov5", "custom", path="best.pt", device="cpu", force_reload=True
124
- )
125
- model.cpu()
126
-
127
- classes = [
128
- "gas-distribution-meter",
129
- "gas-distribution-piping",
130
- "gas-distribution-regulator",
131
- "gas-distribution-valve",
132
- ]
133
-
134
- class_to_idx = {
135
- "gas-distribution-meter": 0,
136
- "gas-distribution-piping": 1,
137
- "gas-distribution-regulator": 2,
138
- "gas-distribution-valve": 3,
139
- }
140
-
141
- idx_to_classes = {v: k for k, v in class_to_idx.items()}
142
- modelname = "resnet50d"
143
- model_weights = "best_classifer_model.pt"
144
- num_classes = len(classes)
145
-
146
- classifier_model = timm.create_model(
147
- "resnet50d", pretrained=True, num_classes=num_classes, drop_path_rate=0.05
148
- )
149
- classifier_model.load_state_dict(
150
- torch.load(model_weights, map_location=torch.device("cpu"))["model_state_dict"]
151
- )
152
-
153
- musiq_metric = pyiqa.create_metric("musiq-koniq", device=torch.device("cpu"))
154
- image_sim_model = SentenceTransformer("clip-ViT-B-32")
155
-
156
 
157
- # model_ckpt = "nateraw/vit-base-beans"
158
- # extractor = AutoFeatureExtractor.from_pretrained(model_ckpt)
159
- # image_sim_model = AutoModel.from_pretrained(model_ckpt)
160
 
161
 
162
  app = FastAPI(title="CV Demos")
163
 
164
- # Define the Response
165
- class Prediction(BaseModel):
166
- filename: str
167
- contenttype: str
168
- prediction: List[float] = []
169
-
170
-
171
- # define response
172
- @app.get("/")
173
- def root_route():
174
- return {"error": f"Use GET {DETECTION_URL} instead of the root route!"}
175
-
176
-
177
- @app.post(
178
- DETECTION_URL,
179
- )
180
- async def predict(file: UploadFile = File(...), quality_check: bool = False):
181
- try:
182
- extension = file.filename.split(".")[-1] in ("jpg", "jpeg", "png")
183
- if not extension:
184
- return "Image must be jpg or png format!"
185
- # read image contain
186
- contents = await file.read()
187
- pil_image = Image.open(io.BytesIO(contents))
188
- if quality_check:
189
- print("RUNNING QUALITY CEHCK BEFORE OBJEFCT DETECTION!!!")
190
- tmp_file = f"{TMP_DIR}/tmp.png"
191
- pil_image.save(tmp_file)
192
- score = musiq_metric(tmp_file)
193
- if score < 50:
194
- return {
195
- "Error": "Image quality is not sufficient enough to be considered for object detection"
196
- }
197
-
198
- results = model(pil_image, size=640) # reduce size=320 for faster inference
199
- return results.pandas().xyxy[0].to_json(orient="records")
200
- except:
201
- e = sys.exc_info()[1]
202
- raise HTTPException(status_code=500, detail=str(e))
203
-
204
-
205
- @app.post(CLASSIFICATION_URL)
206
- async def classify(file: UploadFile = File(...)):
207
- try:
208
- extension = file.filename.split(".")[-1] in ("jpg", "jpeg", "png")
209
- if not extension:
210
- return "Image must be jpg or png format!"
211
- # read image contain
212
- contents = await file.read()
213
- pil_image = Image.open(io.BytesIO(contents))
214
- data_mean = (0.485, 0.456, 0.406)
215
- data_std = (0.229, 0.224, 0.225)
216
- image_size = (224, 224)
217
- eval_transforms = timm.data.create_transform(
218
- input_size=image_size, mean=data_mean, std=data_std
219
- )
220
- eval_transforms(pil_image).unsqueeze(dim=0).shape
221
- classifier_model.eval()
222
- print("RUNNING Image Classification!!!")
223
- max_class_idx = np.argmax(
224
- classifier_model(eval_transforms(pil_image).unsqueeze(dim=0)).detach().numpy()
225
- )
226
- predicted_class = idx_to_classes[max_class_idx]
227
- print(f"Predicted Class idx: {max_class_idx} with name : {predicted_class}")
228
- return {"object": predicted_class}
229
-
230
- except:
231
- e = sys.exc_info()[1]
232
- raise HTTPException(status_code=500, detail=str(e))
233
-
234
-
235
- @app.post(QUALITY_ASSESSMENT_URL)
236
- async def quality_check(file: UploadFile = File(...)):
237
- try:
238
- extension = file.filename.split(".")[-1] in ("jpg", "jpeg", "png")
239
- if not extension:
240
- return "Image must be jpg or png format!"
241
- # read image contain
242
- contents = await file.read()
243
- pil_image = Image.open(io.BytesIO(contents))
244
- tmp_file = f"{TMP_DIR}/tmp.png"
245
- pil_image.save(tmp_file)
246
- score = musiq_metric(tmp_file).detach().numpy().tolist()
247
- return {"score": score}
248
-
249
- except:
250
- e = sys.exc_info()[1]
251
- raise HTTPException(status_code=500, detail=str(e))
252
-
253
-
254
- def anonymize_simple(image, factor=3.0):
255
- # automatically determine the size of the blurring kernel based
256
- # on the spatial dimensions of the input image
257
- (h, w) = image.shape[:2]
258
- kW = int(w / factor)
259
- kH = int(h / factor)
260
- # ensure the width of the kernel is odd
261
- if kW % 2 == 0:
262
- kW -= 1
263
- # ensure the height of the kernel is odd
264
- if kH % 2 == 0:
265
- kH -= 1
266
- # apply a Gaussian blur to the input image using our computed
267
- # kernel size
268
- return cv2.GaussianBlur(image, (kW, kH), 0)
269
-
270
-
271
- def anonymize_pixelate(image, blocks=3):
272
- # divide the input image into NxN blocks
273
- (h, w) = image.shape[:2]
274
- xSteps = np.linspace(0, w, blocks + 1, dtype="int")
275
- ySteps = np.linspace(0, h, blocks + 1, dtype="int")
276
- # loop over the blocks in both the x and y direction
277
- for i in range(1, len(ySteps)):
278
- for j in range(1, len(xSteps)):
279
- # compute the starting and ending (x, y)-coordinates
280
- # for the current block
281
- startX = xSteps[j - 1]
282
- startY = ySteps[i - 1]
283
- endX = xSteps[j]
284
- endY = ySteps[i]
285
- # extract the ROI using NumPy array slicing, compute the
286
- # mean of the ROI, and then draw a rectangle with the
287
- # mean RGB values over the ROI in the original image
288
- roi = image[startY:endY, startX:endX]
289
- (B, G, R) = [int(x) for x in cv2.mean(roi)[:3]]
290
- cv2.rectangle(image, (startX, startY), (endX, endY), (B, G, R), -1)
291
- # return the pixelated blurred image
292
- return image
293
-
294
 
295
  # define response
296
  @app.get("/")
297
  def root_route():
298
- return {"error": f"Use GET {FACE_URL} or {LICENCE_URL} instead of the root route!"}
299
-
300
-
301
- @app.post(
302
- FACE_URL,
303
- )
304
- async def face_anonymize(
305
- file: UploadFile = File(...), blur_type="simple", quality_check: bool = False
306
- ):
307
- """
308
- https://pyimagesearch.com/2020/04/06/blur-and-anonymize-faces-with-opencv-and-python/
309
- """
310
- try:
311
- extension = file.filename.split(".")[-1] in ("jpg", "jpeg", "png")
312
- if not extension:
313
- return "Image must be jpg or png format!"
314
- # read image contain
315
- contents = await file.read()
316
- pil_image = Image.open(io.BytesIO(contents)).convert("RGB")
317
- detector = cv2.dnn.DetectionModel(
318
- "res10_300x300_ssd_iter_140000_fp16.caffemodel", "deploy.prototxt"
319
- )
320
- open_cv_image = np.array(pil_image)
321
- # Convert RGB to BGR
322
- open_cv_image = open_cv_image[:, :, ::-1].copy()
323
- (h, w) = open_cv_image.shape[:2]
324
- # Getting the detections
325
- detections = detector.detect(open_cv_image)
326
- if len(detections[2]) > 0:
327
- for face in detections[2]:
328
- (x, y, w, h) = face.astype("int")
329
- # extract the face ROI
330
-
331
- face = open_cv_image[y : y + h, x : x + w]
332
- if blur_type == "simple":
333
- face = anonymize_simple(face)
334
- else:
335
- face = anonymize_pixelate(face)
336
- open_cv_image[y : y + h, x : x + w] = face
337
-
338
- _, encoded_img = cv2.imencode(".PNG", open_cv_image)
339
-
340
- encoded_img = base64.b64encode(encoded_img)
341
- return {
342
- "filename": file.filename,
343
- "dimensions": str(open_cv_image.shape),
344
- "encoded_img": encoded_img,
345
- }
346
- except:
347
- e = sys.exc_info()[1]
348
- print(traceback.format_exc())
349
- raise HTTPException(status_code=500, detail=str(e))
350
-
351
-
352
- @app.post(LICENCE_URL)
353
- async def licence_anonymize(file: UploadFile = File(...), blur_type="simple"):
354
- """https://www.kaggle.com/code/gowrishankarp/license-plate-detection-yolov5-pytesseract/notebook#Visualize"""
355
- try:
356
- extension = file.filename.split(".")[-1] in ("jpg", "jpeg", "png")
357
- if not extension:
358
- return "Image must be jpg or png format!"
359
- # read image contain
360
- contents = await file.read()
361
- pil_image = Image.open(io.BytesIO(contents))
362
- results = licence_model(pil_image, size=640) # reduce size=320 for faster inference
363
- pil_image = pil_image.convert("RGB")
364
- open_cv_image = np.array(pil_image)
365
- open_cv_image = open_cv_image[:, :, ::-1].copy()
366
- df = results.pandas().xyxy[0]
367
- for i, row in df.iterrows():
368
- xmin = int(row["xmin"])
369
- ymin = int(row["ymin"])
370
- xmax = int(row["xmax"])
371
- ymax = int(row["ymax"])
372
- licence = open_cv_image[ymin:ymax, xmin:xmax]
373
- if blur_type == "simple":
374
- licence = anonymize_simple(licence)
375
- else:
376
- licence = anonymize_pixelate(licence)
377
- open_cv_image[ymin:ymax, xmin:xmax] = licence
378
-
379
- _, encoded_img = cv2.imencode(".PNG", open_cv_image)
380
-
381
- encoded_img = base64.b64encode(encoded_img)
382
- return {
383
- "filename": file.filename,
384
- "dimensions": str(open_cv_image.shape),
385
- "encoded_img": encoded_img,
386
- }
387
-
388
- except:
389
- e = sys.exc_info()[1]
390
- raise HTTPException(status_code=500, detail=str(e))
391
-
392
-
393
- def process_document(image, question):
394
- # prepare encoder inputs
395
- pixel_values = processor(image, return_tensors="pt").pixel_values
396
-
397
- # prepare decoder inputs
398
- task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
399
- prompt = task_prompt.replace("{user_input}", question)
400
- decoder_input_ids = processor.tokenizer(
401
- prompt, add_special_tokens=False, return_tensors="pt"
402
- ).input_ids
403
-
404
- # generate answer
405
- outputs = doc_qa_model.generate(
406
- pixel_values.to(device),
407
- decoder_input_ids=decoder_input_ids.to(device),
408
- max_length=doc_qa_model.decoder.config.max_position_embeddings,
409
- early_stopping=True,
410
- pad_token_id=processor.tokenizer.pad_token_id,
411
- eos_token_id=processor.tokenizer.eos_token_id,
412
- use_cache=True,
413
- num_beams=1,
414
- bad_words_ids=[[processor.tokenizer.unk_token_id]],
415
- return_dict_in_generate=True,
416
- )
417
-
418
- # postprocess
419
- sequence = processor.batch_decode(outputs.sequences)[0]
420
- sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(
421
- processor.tokenizer.pad_token, ""
422
- )
423
- sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token
424
-
425
- return processor.token2json(sequence)
426
-
427
-
428
- @app.post(DOCUMENT_QA)
429
- async def document_qa(question: str = Form(...), file: UploadFile = File(...)):
430
-
431
- try:
432
- extension = file.filename.split(".")[-1] in ("jpg", "jpeg", "png")
433
- if not extension:
434
- return "Image must be jpg or png format!"
435
- # read image contain
436
- contents = await file.read()
437
- pil_image = Image.open(io.BytesIO(contents))
438
- # tmp_file = f"{TMP_DIR}/tmp.png"
439
- # pil_image.save(tmp_file)
440
- # answer_git_large = generate_answer_git(git_processor_large, git_model_large, image, question)
441
-
442
- answer = process_document(pil_image, question)["answer"]
443
-
444
- return {"answer": answer}
445
-
446
- except:
447
- e = sys.exc_info()[1]
448
- raise HTTPException(status_code=500, detail=str(e))
449
 
450
 
451
  @app.post(IMAGE_SIMILARITY_DEMO)
 
 
 
  import argparse
  import base64
  import io

  import traceback
  import uuid
  from typing import List, Optional
+
  import cv2
  import numpy as np
  import pandas as pd

  import uvicorn
  from dotenv import load_dotenv
  from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+ from PIL import Image, ImageEnhance
  from pydantic import BaseModel
  from sentence_transformers import SentenceTransformer, util

  load_dotenv()
  pinecone.init(api_key=os.getenv("PINECONE_KEY"), environment=os.getenv("PINECONE_ENV"))
+
  IMAGE_SIMILARITY_DEMO = "/find-similar-image/"
  IMAGE_SIMILARITY_PINECONE_DEMO = "/find-similar-image-pinecone/"
  INDEX_NAME = "imagesearch-demo"
  INDEX_DIMENSION = 512
  TMP_DIR = "tmp"

+ image_sim_model = SentenceTransformer("clip-ViT-B-32")

  def enhance_image(pil_image):
      # Convert PIL Image to OpenCV format

  os.makedirs(TMP_DIR, exist_ok=True)

  device = "cuda" if torch.cuda.is_available() else "cpu"

  os.makedirs(TMP_DIR, exist_ok=True)

  app = FastAPI(title="CV Demos")

  # define response
  @app.get("/")
  def root_route():
+     return {"error": f"Use GET {IMAGE_SIMILARITY_PINECONE_DEMO} instead of the root route!"}

  @app.post(IMAGE_SIMILARITY_DEMO)
best.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c8faa2592e29248e58453cb031e536bd96f2929d9768bbd3c78ea54944f045db
- size 14447677

best_classifer_model.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4e5c0f63fbe8f8349ceda742cc6c7d333c1a2ae443b6f7aa1d100859d59322a7
- size 377080432

deploy.prototxt DELETED
@@ -1,1789 +0,0 @@
1
- input: "data"
2
- input_shape {
3
- dim: 1
4
- dim: 3
5
- dim: 300
6
- dim: 300
7
- }
8
-
9
- layer {
10
- name: "data_bn"
11
- type: "BatchNorm"
12
- bottom: "data"
13
- top: "data_bn"
14
- param {
15
- lr_mult: 0.0
16
- }
17
- param {
18
- lr_mult: 0.0
19
- }
20
- param {
21
- lr_mult: 0.0
22
- }
23
- }
24
- layer {
25
- name: "data_scale"
26
- type: "Scale"
27
- bottom: "data_bn"
28
- top: "data_bn"
29
- param {
30
- lr_mult: 1.0
31
- decay_mult: 1.0
32
- }
33
- param {
34
- lr_mult: 2.0
35
- decay_mult: 1.0
36
- }
37
- scale_param {
38
- bias_term: true
39
- }
40
- }
41
- layer {
42
- name: "conv1_h"
43
- type: "Convolution"
44
- bottom: "data_bn"
45
- top: "conv1_h"
46
- param {
47
- lr_mult: 1.0
48
- decay_mult: 1.0
49
- }
50
- param {
51
- lr_mult: 2.0
52
- decay_mult: 1.0
53
- }
54
- convolution_param {
55
- num_output: 32
56
- pad: 3
57
- kernel_size: 7
58
- stride: 2
59
- weight_filler {
60
- type: "msra"
61
- variance_norm: FAN_OUT
62
- }
63
- bias_filler {
64
- type: "constant"
65
- value: 0.0
66
- }
67
- }
68
- }
69
- layer {
70
- name: "conv1_bn_h"
71
- type: "BatchNorm"
72
- bottom: "conv1_h"
73
- top: "conv1_h"
74
- param {
75
- lr_mult: 0.0
76
- }
77
- param {
78
- lr_mult: 0.0
79
- }
80
- param {
81
- lr_mult: 0.0
82
- }
83
- }
84
- layer {
85
- name: "conv1_scale_h"
86
- type: "Scale"
87
- bottom: "conv1_h"
88
- top: "conv1_h"
89
- param {
90
- lr_mult: 1.0
91
- decay_mult: 1.0
92
- }
93
- param {
94
- lr_mult: 2.0
95
- decay_mult: 1.0
96
- }
97
- scale_param {
98
- bias_term: true
99
- }
100
- }
101
- layer {
102
- name: "conv1_relu"
103
- type: "ReLU"
104
- bottom: "conv1_h"
105
- top: "conv1_h"
106
- }
107
- layer {
108
- name: "conv1_pool"
109
- type: "Pooling"
110
- bottom: "conv1_h"
111
- top: "conv1_pool"
112
- pooling_param {
113
- kernel_size: 3
114
- stride: 2
115
- }
116
- }
117
- layer {
118
- name: "layer_64_1_conv1_h"
119
- type: "Convolution"
120
- bottom: "conv1_pool"
121
- top: "layer_64_1_conv1_h"
122
- param {
123
- lr_mult: 1.0
124
- decay_mult: 1.0
125
- }
126
- convolution_param {
127
- num_output: 32
128
- bias_term: false
129
- pad: 1
130
- kernel_size: 3
131
- stride: 1
132
- weight_filler {
133
- type: "msra"
134
- }
135
- bias_filler {
136
- type: "constant"
137
- value: 0.0
138
- }
139
- }
140
- }
141
- layer {
142
- name: "layer_64_1_bn2_h"
143
- type: "BatchNorm"
144
- bottom: "layer_64_1_conv1_h"
145
- top: "layer_64_1_conv1_h"
146
- param {
147
- lr_mult: 0.0
148
- }
149
- param {
150
- lr_mult: 0.0
151
- }
152
- param {
153
- lr_mult: 0.0
154
- }
155
- }
156
- layer {
157
- name: "layer_64_1_scale2_h"
158
- type: "Scale"
159
- bottom: "layer_64_1_conv1_h"
160
- top: "layer_64_1_conv1_h"
161
- param {
162
- lr_mult: 1.0
163
- decay_mult: 1.0
164
- }
165
- param {
166
- lr_mult: 2.0
167
- decay_mult: 1.0
168
- }
169
- scale_param {
170
- bias_term: true
171
- }
172
- }
173
- layer {
174
- name: "layer_64_1_relu2"
175
- type: "ReLU"
176
- bottom: "layer_64_1_conv1_h"
177
- top: "layer_64_1_conv1_h"
178
- }
179
- layer {
180
- name: "layer_64_1_conv2_h"
181
- type: "Convolution"
182
- bottom: "layer_64_1_conv1_h"
183
- top: "layer_64_1_conv2_h"
184
- param {
185
- lr_mult: 1.0
186
- decay_mult: 1.0
187
- }
188
- convolution_param {
189
- num_output: 32
190
- bias_term: false
191
- pad: 1
192
- kernel_size: 3
193
- stride: 1
194
- weight_filler {
195
- type: "msra"
196
- }
197
- bias_filler {
198
- type: "constant"
199
- value: 0.0
200
- }
201
- }
202
- }
203
- layer {
204
- name: "layer_64_1_sum"
205
- type: "Eltwise"
206
- bottom: "layer_64_1_conv2_h"
207
- bottom: "conv1_pool"
208
- top: "layer_64_1_sum"
209
- }
210
- layer {
211
- name: "layer_128_1_bn1_h"
212
- type: "BatchNorm"
213
- bottom: "layer_64_1_sum"
214
- top: "layer_128_1_bn1_h"
215
- param {
216
- lr_mult: 0.0
217
- }
218
- param {
219
- lr_mult: 0.0
220
- }
221
- param {
222
- lr_mult: 0.0
223
- }
224
- }
225
- layer {
226
- name: "layer_128_1_scale1_h"
227
- type: "Scale"
228
- bottom: "layer_128_1_bn1_h"
229
- top: "layer_128_1_bn1_h"
230
- param {
231
- lr_mult: 1.0
232
- decay_mult: 1.0
233
- }
234
- param {
235
- lr_mult: 2.0
236
- decay_mult: 1.0
237
- }
238
- scale_param {
239
- bias_term: true
240
- }
241
- }
242
- layer {
243
- name: "layer_128_1_relu1"
244
- type: "ReLU"
245
- bottom: "layer_128_1_bn1_h"
246
- top: "layer_128_1_bn1_h"
247
- }
248
- layer {
249
- name: "layer_128_1_conv1_h"
250
- type: "Convolution"
251
- bottom: "layer_128_1_bn1_h"
252
- top: "layer_128_1_conv1_h"
253
- param {
254
- lr_mult: 1.0
255
- decay_mult: 1.0
256
- }
257
- convolution_param {
258
- num_output: 128
259
- bias_term: false
260
- pad: 1
261
- kernel_size: 3
262
- stride: 2
263
- weight_filler {
264
- type: "msra"
265
- }
266
- bias_filler {
267
- type: "constant"
268
- value: 0.0
269
- }
270
- }
271
- }
272
- layer {
273
- name: "layer_128_1_bn2"
274
- type: "BatchNorm"
275
- bottom: "layer_128_1_conv1_h"
276
- top: "layer_128_1_conv1_h"
277
- param {
278
- lr_mult: 0.0
279
- }
280
- param {
281
- lr_mult: 0.0
282
- }
283
- param {
284
- lr_mult: 0.0
285
- }
286
- }
287
- layer {
288
- name: "layer_128_1_scale2"
289
- type: "Scale"
290
- bottom: "layer_128_1_conv1_h"
291
- top: "layer_128_1_conv1_h"
292
- param {
293
- lr_mult: 1.0
294
- decay_mult: 1.0
295
- }
296
- param {
297
- lr_mult: 2.0
298
- decay_mult: 1.0
299
- }
300
- scale_param {
301
- bias_term: true
302
- }
303
- }
304
- layer {
305
- name: "layer_128_1_relu2"
306
- type: "ReLU"
307
- bottom: "layer_128_1_conv1_h"
308
- top: "layer_128_1_conv1_h"
309
- }
310
- layer {
311
- name: "layer_128_1_conv2"
312
- type: "Convolution"
313
- bottom: "layer_128_1_conv1_h"
314
- top: "layer_128_1_conv2"
315
- param {
316
- lr_mult: 1.0
317
- decay_mult: 1.0
318
- }
319
- convolution_param {
320
- num_output: 128
321
- bias_term: false
322
- pad: 1
323
- kernel_size: 3
324
- stride: 1
325
- weight_filler {
326
- type: "msra"
327
- }
328
- bias_filler {
329
- type: "constant"
330
- value: 0.0
331
- }
332
- }
333
- }
334
- layer {
335
- name: "layer_128_1_conv_expand_h"
336
- type: "Convolution"
337
- bottom: "layer_128_1_bn1_h"
338
- top: "layer_128_1_conv_expand_h"
339
- param {
340
- lr_mult: 1.0
341
- decay_mult: 1.0
342
- }
343
- convolution_param {
344
- num_output: 128
345
- bias_term: false
346
- pad: 0
347
- kernel_size: 1
348
- stride: 2
349
- weight_filler {
350
- type: "msra"
351
- }
352
- bias_filler {
353
- type: "constant"
354
- value: 0.0
355
- }
356
- }
357
- }
358
- layer {
359
- name: "layer_128_1_sum"
360
- type: "Eltwise"
361
- bottom: "layer_128_1_conv2"
362
- bottom: "layer_128_1_conv_expand_h"
363
- top: "layer_128_1_sum"
364
- }
365
- layer {
366
- name: "layer_256_1_bn1"
367
- type: "BatchNorm"
368
- bottom: "layer_128_1_sum"
369
- top: "layer_256_1_bn1"
370
- param {
371
- lr_mult: 0.0
372
- }
373
- param {
374
- lr_mult: 0.0
375
- }
376
- param {
377
- lr_mult: 0.0
378
- }
379
- }
380
- layer {
381
- name: "layer_256_1_scale1"
382
- type: "Scale"
383
- bottom: "layer_256_1_bn1"
384
- top: "layer_256_1_bn1"
385
- param {
386
- lr_mult: 1.0
387
- decay_mult: 1.0
388
- }
389
- param {
390
- lr_mult: 2.0
391
- decay_mult: 1.0
392
- }
393
- scale_param {
394
- bias_term: true
395
- }
396
- }
397
- layer {
398
- name: "layer_256_1_relu1"
399
- type: "ReLU"
400
- bottom: "layer_256_1_bn1"
401
- top: "layer_256_1_bn1"
402
- }
403
- layer {
404
- name: "layer_256_1_conv1"
405
- type: "Convolution"
406
- bottom: "layer_256_1_bn1"
407
- top: "layer_256_1_conv1"
408
- param {
409
- lr_mult: 1.0
410
- decay_mult: 1.0
411
- }
412
- convolution_param {
413
- num_output: 256
414
- bias_term: false
415
- pad: 1
416
- kernel_size: 3
417
- stride: 2
418
- weight_filler {
419
- type: "msra"
420
- }
421
- bias_filler {
422
- type: "constant"
423
- value: 0.0
424
- }
425
- }
426
- }
427
- layer {
428
- name: "layer_256_1_bn2"
429
- type: "BatchNorm"
430
- bottom: "layer_256_1_conv1"
431
- top: "layer_256_1_conv1"
432
- param {
433
- lr_mult: 0.0
434
- }
435
- param {
436
- lr_mult: 0.0
437
- }
438
- param {
439
- lr_mult: 0.0
440
- }
441
- }
442
- layer {
443
- name: "layer_256_1_scale2"
444
- type: "Scale"
445
- bottom: "layer_256_1_conv1"
446
- top: "layer_256_1_conv1"
447
- param {
448
- lr_mult: 1.0
449
- decay_mult: 1.0
450
- }
451
- param {
452
- lr_mult: 2.0
453
- decay_mult: 1.0
454
- }
455
- scale_param {
456
- bias_term: true
457
- }
458
- }
459
- layer {
460
- name: "layer_256_1_relu2"
461
- type: "ReLU"
462
- bottom: "layer_256_1_conv1"
463
- top: "layer_256_1_conv1"
464
- }
465
- layer {
466
- name: "layer_256_1_conv2"
467
- type: "Convolution"
468
- bottom: "layer_256_1_conv1"
469
- top: "layer_256_1_conv2"
470
- param {
471
- lr_mult: 1.0
472
- decay_mult: 1.0
473
- }
474
- convolution_param {
475
- num_output: 256
476
- bias_term: false
477
- pad: 1
478
- kernel_size: 3
479
- stride: 1
480
- weight_filler {
481
- type: "msra"
482
- }
483
- bias_filler {
484
- type: "constant"
485
- value: 0.0
486
- }
487
- }
488
- }
489
- layer {
490
- name: "layer_256_1_conv_expand"
491
- type: "Convolution"
492
- bottom: "layer_256_1_bn1"
493
- top: "layer_256_1_conv_expand"
494
- param {
495
- lr_mult: 1.0
496
- decay_mult: 1.0
497
- }
498
- convolution_param {
499
- num_output: 256
500
- bias_term: false
501
- pad: 0
502
- kernel_size: 1
503
- stride: 2
504
- weight_filler {
505
- type: "msra"
506
- }
507
- bias_filler {
508
- type: "constant"
509
- value: 0.0
510
- }
511
- }
512
- }
513
- layer {
514
- name: "layer_256_1_sum"
515
- type: "Eltwise"
516
- bottom: "layer_256_1_conv2"
517
- bottom: "layer_256_1_conv_expand"
518
- top: "layer_256_1_sum"
519
- }
520
- layer {
521
- name: "layer_512_1_bn1"
522
- type: "BatchNorm"
523
- bottom: "layer_256_1_sum"
524
- top: "layer_512_1_bn1"
525
- param {
526
- lr_mult: 0.0
527
- }
528
- param {
529
- lr_mult: 0.0
530
- }
531
- param {
532
- lr_mult: 0.0
533
- }
534
- }
535
- layer {
536
- name: "layer_512_1_scale1"
537
- type: "Scale"
538
- bottom: "layer_512_1_bn1"
539
- top: "layer_512_1_bn1"
540
- param {
541
- lr_mult: 1.0
542
- decay_mult: 1.0
543
- }
544
- param {
545
- lr_mult: 2.0
546
- decay_mult: 1.0
547
- }
548
- scale_param {
549
- bias_term: true
550
- }
551
- }
552
- layer {
553
- name: "layer_512_1_relu1"
554
- type: "ReLU"
555
- bottom: "layer_512_1_bn1"
556
- top: "layer_512_1_bn1"
557
- }
558
- layer {
559
- name: "layer_512_1_conv1_h"
560
- type: "Convolution"
561
- bottom: "layer_512_1_bn1"
562
- top: "layer_512_1_conv1_h"
563
- param {
564
- lr_mult: 1.0
565
- decay_mult: 1.0
566
- }
567
- convolution_param {
568
- num_output: 128
569
- bias_term: false
570
- pad: 1
571
- kernel_size: 3
572
- stride: 1 # 2
573
- weight_filler {
574
- type: "msra"
575
- }
576
- bias_filler {
577
- type: "constant"
578
- value: 0.0
579
- }
580
- }
581
- }
582
- layer {
583
- name: "layer_512_1_bn2_h"
584
- type: "BatchNorm"
585
- bottom: "layer_512_1_conv1_h"
586
- top: "layer_512_1_conv1_h"
587
- param {
588
- lr_mult: 0.0
589
- }
590
- param {
591
- lr_mult: 0.0
592
- }
593
- param {
594
- lr_mult: 0.0
595
- }
596
- }
597
- layer {
598
- name: "layer_512_1_scale2_h"
599
- type: "Scale"
600
- bottom: "layer_512_1_conv1_h"
601
- top: "layer_512_1_conv1_h"
602
- param {
603
- lr_mult: 1.0
604
- decay_mult: 1.0
605
- }
606
- param {
607
- lr_mult: 2.0
608
- decay_mult: 1.0
609
- }
610
- scale_param {
611
- bias_term: true
612
- }
613
- }
614
- layer {
615
- name: "layer_512_1_relu2"
616
- type: "ReLU"
617
- bottom: "layer_512_1_conv1_h"
618
- top: "layer_512_1_conv1_h"
619
- }
620
- layer {
621
- name: "layer_512_1_conv2_h"
622
- type: "Convolution"
623
- bottom: "layer_512_1_conv1_h"
624
- top: "layer_512_1_conv2_h"
625
- param {
626
- lr_mult: 1.0
627
- decay_mult: 1.0
628
- }
629
- convolution_param {
630
- num_output: 256
631
- bias_term: false
632
- pad: 2 # 1
633
- kernel_size: 3
634
- stride: 1
635
- dilation: 2
636
- weight_filler {
637
- type: "msra"
638
- }
639
- bias_filler {
640
- type: "constant"
641
- value: 0.0
642
- }
643
- }
644
- }
645
- layer {
646
- name: "layer_512_1_conv_expand_h"
647
- type: "Convolution"
648
- bottom: "layer_512_1_bn1"
649
- top: "layer_512_1_conv_expand_h"
650
- param {
651
- lr_mult: 1.0
652
- decay_mult: 1.0
653
- }
654
- convolution_param {
655
- num_output: 256
656
- bias_term: false
657
- pad: 0
658
- kernel_size: 1
659
- stride: 1 # 2
660
- weight_filler {
661
- type: "msra"
662
- }
663
- bias_filler {
664
- type: "constant"
665
- value: 0.0
666
- }
667
- }
668
- }
669
- layer {
670
- name: "layer_512_1_sum"
671
- type: "Eltwise"
672
- bottom: "layer_512_1_conv2_h"
673
- bottom: "layer_512_1_conv_expand_h"
674
- top: "layer_512_1_sum"
675
- }
676
- layer {
677
- name: "last_bn_h"
678
- type: "BatchNorm"
679
- bottom: "layer_512_1_sum"
680
- top: "layer_512_1_sum"
681
- param {
682
- lr_mult: 0.0
683
- }
684
- param {
685
- lr_mult: 0.0
686
- }
687
- param {
688
- lr_mult: 0.0
689
- }
690
- }
691
- layer {
692
- name: "last_scale_h"
693
- type: "Scale"
694
- bottom: "layer_512_1_sum"
695
- top: "layer_512_1_sum"
696
- param {
697
- lr_mult: 1.0
698
- decay_mult: 1.0
699
- }
700
- param {
701
- lr_mult: 2.0
702
- decay_mult: 1.0
703
- }
704
- scale_param {
705
- bias_term: true
706
- }
707
- }
708
- layer {
709
- name: "last_relu"
710
- type: "ReLU"
711
- bottom: "layer_512_1_sum"
712
- top: "fc7"
713
- }
714
-
715
- layer {
716
- name: "conv6_1_h"
717
- type: "Convolution"
718
- bottom: "fc7"
719
- top: "conv6_1_h"
720
- param {
721
- lr_mult: 1
722
- decay_mult: 1
723
- }
724
- param {
725
- lr_mult: 2
726
- decay_mult: 0
727
- }
728
- convolution_param {
729
- num_output: 128
730
- pad: 0
731
- kernel_size: 1
732
- stride: 1
733
- weight_filler {
734
- type: "xavier"
735
- }
736
- bias_filler {
737
- type: "constant"
738
- value: 0
739
- }
740
- }
741
- }
742
- layer {
743
- name: "conv6_1_relu"
744
- type: "ReLU"
745
- bottom: "conv6_1_h"
746
- top: "conv6_1_h"
747
- }
748
- layer {
749
- name: "conv6_2_h"
750
- type: "Convolution"
751
- bottom: "conv6_1_h"
752
- top: "conv6_2_h"
753
- param {
754
- lr_mult: 1
755
- decay_mult: 1
756
- }
757
- param {
758
- lr_mult: 2
759
- decay_mult: 0
760
- }
761
- convolution_param {
762
- num_output: 256
763
- pad: 1
764
- kernel_size: 3
765
- stride: 2
766
- weight_filler {
767
- type: "xavier"
768
- }
769
- bias_filler {
770
- type: "constant"
771
- value: 0
772
- }
773
- }
774
- }
775
- layer {
776
- name: "conv6_2_relu"
777
- type: "ReLU"
778
- bottom: "conv6_2_h"
779
- top: "conv6_2_h"
780
- }
781
- layer {
782
- name: "conv7_1_h"
783
- type: "Convolution"
784
- bottom: "conv6_2_h"
785
- top: "conv7_1_h"
786
- param {
787
- lr_mult: 1
788
- decay_mult: 1
789
- }
790
- param {
791
- lr_mult: 2
792
- decay_mult: 0
793
- }
794
- convolution_param {
795
- num_output: 64
796
- pad: 0
797
- kernel_size: 1
798
- stride: 1
799
- weight_filler {
800
- type: "xavier"
801
- }
802
- bias_filler {
803
- type: "constant"
804
- value: 0
805
- }
806
- }
807
- }
808
- layer {
809
- name: "conv7_1_relu"
810
- type: "ReLU"
811
- bottom: "conv7_1_h"
812
- top: "conv7_1_h"
813
- }
814
- layer {
815
- name: "conv7_2_h"
816
- type: "Convolution"
817
- bottom: "conv7_1_h"
818
- top: "conv7_2_h"
819
- param {
820
- lr_mult: 1
821
- decay_mult: 1
822
- }
823
- param {
824
- lr_mult: 2
825
- decay_mult: 0
826
- }
827
- convolution_param {
828
- num_output: 128
829
- pad: 1
830
- kernel_size: 3
831
- stride: 2
832
- weight_filler {
833
- type: "xavier"
834
- }
835
- bias_filler {
836
- type: "constant"
837
- value: 0
838
- }
839
- }
840
- }
841
- layer {
842
- name: "conv7_2_relu"
843
- type: "ReLU"
844
- bottom: "conv7_2_h"
845
- top: "conv7_2_h"
846
- }
847
- layer {
848
- name: "conv8_1_h"
849
- type: "Convolution"
850
- bottom: "conv7_2_h"
851
- top: "conv8_1_h"
852
- param {
853
- lr_mult: 1
854
- decay_mult: 1
855
- }
856
- param {
857
- lr_mult: 2
858
- decay_mult: 0
859
- }
860
- convolution_param {
861
- num_output: 64
862
- pad: 0
863
- kernel_size: 1
864
- stride: 1
865
- weight_filler {
866
- type: "xavier"
867
- }
868
- bias_filler {
869
- type: "constant"
870
- value: 0
871
- }
872
- }
873
- }
874
- layer {
875
- name: "conv8_1_relu"
876
- type: "ReLU"
877
- bottom: "conv8_1_h"
878
- top: "conv8_1_h"
879
- }
880
- layer {
881
- name: "conv8_2_h"
882
- type: "Convolution"
883
- bottom: "conv8_1_h"
884
- top: "conv8_2_h"
885
- param {
886
- lr_mult: 1
887
- decay_mult: 1
888
- }
889
- param {
890
- lr_mult: 2
891
- decay_mult: 0
892
- }
893
- convolution_param {
894
- num_output: 128
895
- pad: 1
896
- kernel_size: 3
897
- stride: 1
898
- weight_filler {
899
- type: "xavier"
900
- }
901
- bias_filler {
902
- type: "constant"
903
- value: 0
904
- }
905
- }
906
- }
907
- layer {
908
- name: "conv8_2_relu"
909
- type: "ReLU"
910
- bottom: "conv8_2_h"
911
- top: "conv8_2_h"
912
- }
913
- layer {
914
- name: "conv9_1_h"
915
- type: "Convolution"
916
- bottom: "conv8_2_h"
917
- top: "conv9_1_h"
918
- param {
919
- lr_mult: 1
920
- decay_mult: 1
921
- }
922
- param {
923
- lr_mult: 2
924
- decay_mult: 0
925
- }
926
- convolution_param {
927
- num_output: 64
928
- pad: 0
929
- kernel_size: 1
930
- stride: 1
931
- weight_filler {
932
- type: "xavier"
933
- }
934
- bias_filler {
935
- type: "constant"
936
- value: 0
937
- }
938
- }
939
- }
940
- layer {
941
- name: "conv9_1_relu"
942
- type: "ReLU"
943
- bottom: "conv9_1_h"
944
- top: "conv9_1_h"
945
- }
946
- layer {
947
- name: "conv9_2_h"
948
- type: "Convolution"
949
- bottom: "conv9_1_h"
950
- top: "conv9_2_h"
951
- param {
952
- lr_mult: 1
953
- decay_mult: 1
954
- }
955
- param {
956
- lr_mult: 2
957
- decay_mult: 0
958
- }
959
- convolution_param {
960
- num_output: 128
961
- pad: 1
962
- kernel_size: 3
963
- stride: 1
964
- weight_filler {
965
- type: "xavier"
966
- }
967
- bias_filler {
968
- type: "constant"
969
- value: 0
970
- }
971
- }
972
- }
973
- layer {
974
- name: "conv9_2_relu"
975
- type: "ReLU"
976
- bottom: "conv9_2_h"
977
- top: "conv9_2_h"
978
- }
979
- layer {
980
- name: "conv4_3_norm"
981
- type: "Normalize"
982
- bottom: "layer_256_1_bn1"
983
- top: "conv4_3_norm"
984
- norm_param {
985
- across_spatial: false
986
- scale_filler {
987
- type: "constant"
988
- value: 20
989
- }
990
- channel_shared: false
991
- }
992
- }
993
- layer {
994
- name: "conv4_3_norm_mbox_loc"
995
- type: "Convolution"
996
- bottom: "conv4_3_norm"
997
- top: "conv4_3_norm_mbox_loc"
998
- param {
999
- lr_mult: 1
1000
- decay_mult: 1
1001
- }
1002
- param {
1003
- lr_mult: 2
1004
- decay_mult: 0
1005
- }
1006
- convolution_param {
1007
- num_output: 16
1008
- pad: 1
1009
- kernel_size: 3
1010
- stride: 1
1011
- weight_filler {
1012
- type: "xavier"
1013
- }
1014
- bias_filler {
1015
- type: "constant"
1016
- value: 0
1017
- }
1018
- }
1019
- }
1020
- layer {
1021
- name: "conv4_3_norm_mbox_loc_perm"
1022
- type: "Permute"
1023
- bottom: "conv4_3_norm_mbox_loc"
1024
- top: "conv4_3_norm_mbox_loc_perm"
1025
- permute_param {
1026
- order: 0
1027
- order: 2
1028
- order: 3
1029
- order: 1
1030
- }
1031
- }
1032
- layer {
1033
- name: "conv4_3_norm_mbox_loc_flat"
1034
- type: "Flatten"
1035
- bottom: "conv4_3_norm_mbox_loc_perm"
1036
- top: "conv4_3_norm_mbox_loc_flat"
1037
- flatten_param {
1038
- axis: 1
1039
- }
1040
- }
1041
- layer {
1042
- name: "conv4_3_norm_mbox_conf"
1043
- type: "Convolution"
1044
- bottom: "conv4_3_norm"
1045
- top: "conv4_3_norm_mbox_conf"
1046
- param {
1047
- lr_mult: 1
1048
- decay_mult: 1
1049
- }
1050
- param {
1051
- lr_mult: 2
1052
- decay_mult: 0
1053
- }
1054
- convolution_param {
1055
- num_output: 8 # 84
1056
- pad: 1
1057
- kernel_size: 3
1058
- stride: 1
1059
- weight_filler {
1060
- type: "xavier"
1061
- }
1062
- bias_filler {
1063
- type: "constant"
1064
- value: 0
1065
- }
1066
- }
1067
- }
1068
- layer {
1069
- name: "conv4_3_norm_mbox_conf_perm"
1070
- type: "Permute"
1071
- bottom: "conv4_3_norm_mbox_conf"
1072
- top: "conv4_3_norm_mbox_conf_perm"
1073
- permute_param {
1074
- order: 0
1075
- order: 2
1076
- order: 3
1077
- order: 1
1078
- }
1079
- }
1080
- layer {
1081
- name: "conv4_3_norm_mbox_conf_flat"
1082
- type: "Flatten"
1083
- bottom: "conv4_3_norm_mbox_conf_perm"
1084
- top: "conv4_3_norm_mbox_conf_flat"
1085
- flatten_param {
1086
- axis: 1
1087
- }
1088
- }
1089
- layer {
1090
- name: "conv4_3_norm_mbox_priorbox"
1091
- type: "PriorBox"
1092
- bottom: "conv4_3_norm"
1093
- bottom: "data"
1094
- top: "conv4_3_norm_mbox_priorbox"
1095
- prior_box_param {
1096
- min_size: 30.0
1097
- max_size: 60.0
1098
- aspect_ratio: 2
1099
- flip: true
1100
- clip: false
1101
- variance: 0.1
1102
- variance: 0.1
1103
- variance: 0.2
1104
- variance: 0.2
1105
- step: 8
1106
- offset: 0.5
1107
- }
1108
- }
1109
- layer {
1110
- name: "fc7_mbox_loc"
1111
- type: "Convolution"
1112
- bottom: "fc7"
1113
- top: "fc7_mbox_loc"
1114
- param {
1115
- lr_mult: 1
1116
- decay_mult: 1
1117
- }
1118
- param {
1119
- lr_mult: 2
1120
- decay_mult: 0
1121
- }
1122
- convolution_param {
1123
- num_output: 24
1124
- pad: 1
1125
- kernel_size: 3
1126
- stride: 1
1127
- weight_filler {
1128
- type: "xavier"
1129
- }
1130
- bias_filler {
1131
- type: "constant"
1132
- value: 0
1133
- }
1134
- }
1135
- }
1136
- layer {
1137
- name: "fc7_mbox_loc_perm"
1138
- type: "Permute"
1139
- bottom: "fc7_mbox_loc"
1140
- top: "fc7_mbox_loc_perm"
1141
- permute_param {
1142
- order: 0
1143
- order: 2
1144
- order: 3
1145
- order: 1
1146
- }
1147
- }
1148
- layer {
1149
- name: "fc7_mbox_loc_flat"
1150
- type: "Flatten"
1151
- bottom: "fc7_mbox_loc_perm"
1152
- top: "fc7_mbox_loc_flat"
1153
- flatten_param {
1154
- axis: 1
1155
- }
1156
- }
1157
- layer {
1158
- name: "fc7_mbox_conf"
1159
- type: "Convolution"
1160
- bottom: "fc7"
1161
- top: "fc7_mbox_conf"
1162
- param {
1163
- lr_mult: 1
1164
- decay_mult: 1
1165
- }
1166
- param {
1167
- lr_mult: 2
1168
- decay_mult: 0
1169
- }
1170
- convolution_param {
1171
- num_output: 12 # 126
1172
- pad: 1
1173
- kernel_size: 3
1174
- stride: 1
1175
- weight_filler {
1176
- type: "xavier"
1177
- }
1178
- bias_filler {
1179
- type: "constant"
1180
- value: 0
1181
- }
1182
- }
1183
- }
1184
- layer {
1185
- name: "fc7_mbox_conf_perm"
1186
- type: "Permute"
1187
- bottom: "fc7_mbox_conf"
1188
- top: "fc7_mbox_conf_perm"
1189
- permute_param {
1190
- order: 0
1191
- order: 2
1192
- order: 3
1193
- order: 1
1194
- }
1195
- }
1196
- layer {
1197
- name: "fc7_mbox_conf_flat"
1198
- type: "Flatten"
1199
- bottom: "fc7_mbox_conf_perm"
1200
- top: "fc7_mbox_conf_flat"
1201
- flatten_param {
1202
- axis: 1
1203
- }
1204
- }
1205
- layer {
1206
- name: "fc7_mbox_priorbox"
1207
- type: "PriorBox"
1208
- bottom: "fc7"
1209
- bottom: "data"
1210
- top: "fc7_mbox_priorbox"
1211
- prior_box_param {
1212
- min_size: 60.0
1213
- max_size: 111.0
1214
- aspect_ratio: 2
1215
- aspect_ratio: 3
1216
- flip: true
1217
- clip: false
1218
- variance: 0.1
1219
- variance: 0.1
1220
- variance: 0.2
1221
- variance: 0.2
1222
- step: 16
1223
- offset: 0.5
1224
- }
1225
- }
1226
- layer {
1227
- name: "conv6_2_mbox_loc"
1228
- type: "Convolution"
1229
- bottom: "conv6_2_h"
1230
- top: "conv6_2_mbox_loc"
1231
- param {
1232
- lr_mult: 1
1233
- decay_mult: 1
1234
- }
1235
- param {
1236
- lr_mult: 2
1237
- decay_mult: 0
1238
- }
1239
- convolution_param {
1240
- num_output: 24
1241
- pad: 1
1242
- kernel_size: 3
1243
- stride: 1
1244
- weight_filler {
1245
- type: "xavier"
1246
- }
1247
- bias_filler {
1248
- type: "constant"
1249
- value: 0
1250
- }
1251
- }
1252
- }
1253
- layer {
1254
- name: "conv6_2_mbox_loc_perm"
1255
- type: "Permute"
1256
- bottom: "conv6_2_mbox_loc"
1257
- top: "conv6_2_mbox_loc_perm"
1258
- permute_param {
1259
- order: 0
1260
- order: 2
1261
- order: 3
1262
- order: 1
1263
- }
1264
- }
1265
- layer {
1266
- name: "conv6_2_mbox_loc_flat"
1267
- type: "Flatten"
1268
- bottom: "conv6_2_mbox_loc_perm"
1269
- top: "conv6_2_mbox_loc_flat"
1270
- flatten_param {
1271
- axis: 1
1272
- }
1273
- }
1274
- layer {
1275
- name: "conv6_2_mbox_conf"
1276
- type: "Convolution"
1277
- bottom: "conv6_2_h"
1278
- top: "conv6_2_mbox_conf"
1279
- param {
1280
- lr_mult: 1
1281
- decay_mult: 1
1282
- }
1283
- param {
1284
- lr_mult: 2
1285
- decay_mult: 0
1286
- }
1287
- convolution_param {
1288
- num_output: 12 # 126
1289
- pad: 1
1290
- kernel_size: 3
1291
- stride: 1
1292
- weight_filler {
1293
- type: "xavier"
1294
- }
1295
- bias_filler {
1296
- type: "constant"
1297
- value: 0
1298
- }
1299
- }
1300
- }
1301
- layer {
1302
- name: "conv6_2_mbox_conf_perm"
1303
- type: "Permute"
1304
- bottom: "conv6_2_mbox_conf"
1305
- top: "conv6_2_mbox_conf_perm"
1306
- permute_param {
1307
- order: 0
1308
- order: 2
1309
- order: 3
1310
- order: 1
1311
- }
1312
- }
1313
- layer {
1314
- name: "conv6_2_mbox_conf_flat"
1315
- type: "Flatten"
1316
- bottom: "conv6_2_mbox_conf_perm"
1317
- top: "conv6_2_mbox_conf_flat"
1318
- flatten_param {
1319
- axis: 1
1320
- }
1321
- }
1322
- layer {
1323
- name: "conv6_2_mbox_priorbox"
1324
- type: "PriorBox"
1325
- bottom: "conv6_2_h"
1326
- bottom: "data"
1327
- top: "conv6_2_mbox_priorbox"
1328
- prior_box_param {
1329
- min_size: 111.0
1330
- max_size: 162.0
1331
- aspect_ratio: 2
1332
- aspect_ratio: 3
1333
- flip: true
1334
- clip: false
1335
- variance: 0.1
1336
- variance: 0.1
1337
- variance: 0.2
1338
- variance: 0.2
1339
- step: 32
1340
- offset: 0.5
1341
- }
1342
- }
1343
- layer {
1344
- name: "conv7_2_mbox_loc"
1345
- type: "Convolution"
1346
- bottom: "conv7_2_h"
1347
- top: "conv7_2_mbox_loc"
1348
- param {
1349
- lr_mult: 1
1350
- decay_mult: 1
1351
- }
1352
- param {
1353
- lr_mult: 2
1354
- decay_mult: 0
1355
- }
1356
- convolution_param {
1357
- num_output: 24
1358
- pad: 1
1359
- kernel_size: 3
1360
- stride: 1
1361
- weight_filler {
1362
- type: "xavier"
1363
- }
1364
- bias_filler {
1365
- type: "constant"
1366
- value: 0
1367
- }
1368
- }
1369
- }
1370
- layer {
1371
- name: "conv7_2_mbox_loc_perm"
1372
- type: "Permute"
1373
- bottom: "conv7_2_mbox_loc"
1374
- top: "conv7_2_mbox_loc_perm"
1375
- permute_param {
1376
- order: 0
1377
- order: 2
1378
- order: 3
1379
- order: 1
1380
- }
1381
- }
1382
- layer {
1383
- name: "conv7_2_mbox_loc_flat"
1384
- type: "Flatten"
1385
- bottom: "conv7_2_mbox_loc_perm"
1386
- top: "conv7_2_mbox_loc_flat"
1387
- flatten_param {
1388
- axis: 1
1389
- }
1390
- }
1391
- layer {
1392
- name: "conv7_2_mbox_conf"
1393
- type: "Convolution"
1394
- bottom: "conv7_2_h"
1395
- top: "conv7_2_mbox_conf"
1396
- param {
1397
- lr_mult: 1
1398
- decay_mult: 1
1399
- }
1400
- param {
1401
- lr_mult: 2
1402
- decay_mult: 0
1403
- }
1404
- convolution_param {
1405
- num_output: 12 # 126
1406
- pad: 1
1407
- kernel_size: 3
1408
- stride: 1
1409
- weight_filler {
1410
- type: "xavier"
1411
- }
1412
- bias_filler {
1413
- type: "constant"
1414
- value: 0
1415
- }
1416
- }
1417
- }
1418
- layer {
1419
- name: "conv7_2_mbox_conf_perm"
1420
- type: "Permute"
1421
- bottom: "conv7_2_mbox_conf"
1422
- top: "conv7_2_mbox_conf_perm"
1423
- permute_param {
1424
- order: 0
1425
- order: 2
1426
- order: 3
1427
- order: 1
1428
- }
1429
- }
1430
- layer {
1431
- name: "conv7_2_mbox_conf_flat"
1432
- type: "Flatten"
1433
- bottom: "conv7_2_mbox_conf_perm"
1434
- top: "conv7_2_mbox_conf_flat"
1435
- flatten_param {
1436
- axis: 1
1437
- }
1438
- }
1439
- layer {
1440
- name: "conv7_2_mbox_priorbox"
1441
- type: "PriorBox"
1442
- bottom: "conv7_2_h"
1443
- bottom: "data"
1444
- top: "conv7_2_mbox_priorbox"
1445
- prior_box_param {
1446
- min_size: 162.0
1447
- max_size: 213.0
1448
- aspect_ratio: 2
1449
- aspect_ratio: 3
1450
- flip: true
1451
- clip: false
1452
- variance: 0.1
1453
- variance: 0.1
1454
- variance: 0.2
1455
- variance: 0.2
1456
- step: 64
1457
- offset: 0.5
1458
- }
1459
- }
1460
- layer {
1461
- name: "conv8_2_mbox_loc"
1462
- type: "Convolution"
1463
- bottom: "conv8_2_h"
1464
- top: "conv8_2_mbox_loc"
1465
- param {
1466
- lr_mult: 1
1467
- decay_mult: 1
1468
- }
1469
- param {
1470
- lr_mult: 2
1471
- decay_mult: 0
1472
- }
1473
- convolution_param {
1474
- num_output: 16
1475
- pad: 1
1476
- kernel_size: 3
1477
- stride: 1
1478
- weight_filler {
1479
- type: "xavier"
1480
- }
1481
- bias_filler {
1482
- type: "constant"
1483
- value: 0
1484
- }
1485
- }
1486
- }
1487
- layer {
1488
- name: "conv8_2_mbox_loc_perm"
1489
- type: "Permute"
1490
- bottom: "conv8_2_mbox_loc"
1491
- top: "conv8_2_mbox_loc_perm"
1492
- permute_param {
1493
- order: 0
1494
- order: 2
1495
- order: 3
1496
- order: 1
1497
- }
1498
- }
1499
- layer {
1500
- name: "conv8_2_mbox_loc_flat"
1501
- type: "Flatten"
1502
- bottom: "conv8_2_mbox_loc_perm"
1503
- top: "conv8_2_mbox_loc_flat"
1504
- flatten_param {
1505
- axis: 1
1506
- }
1507
- }
1508
- layer {
1509
- name: "conv8_2_mbox_conf"
1510
- type: "Convolution"
1511
- bottom: "conv8_2_h"
1512
- top: "conv8_2_mbox_conf"
1513
- param {
1514
- lr_mult: 1
1515
- decay_mult: 1
1516
- }
1517
- param {
1518
- lr_mult: 2
1519
- decay_mult: 0
1520
- }
1521
- convolution_param {
1522
- num_output: 8 # 84
1523
- pad: 1
1524
- kernel_size: 3
1525
- stride: 1
1526
- weight_filler {
1527
- type: "xavier"
1528
- }
1529
- bias_filler {
1530
- type: "constant"
1531
- value: 0
1532
- }
1533
- }
1534
- }
1535
- layer {
1536
- name: "conv8_2_mbox_conf_perm"
1537
- type: "Permute"
1538
- bottom: "conv8_2_mbox_conf"
1539
- top: "conv8_2_mbox_conf_perm"
1540
- permute_param {
1541
- order: 0
1542
- order: 2
1543
- order: 3
1544
- order: 1
1545
- }
1546
- }
1547
- layer {
1548
- name: "conv8_2_mbox_conf_flat"
1549
- type: "Flatten"
1550
- bottom: "conv8_2_mbox_conf_perm"
1551
- top: "conv8_2_mbox_conf_flat"
1552
- flatten_param {
1553
- axis: 1
1554
- }
1555
- }
1556
- layer {
1557
- name: "conv8_2_mbox_priorbox"
1558
- type: "PriorBox"
1559
- bottom: "conv8_2_h"
1560
- bottom: "data"
1561
- top: "conv8_2_mbox_priorbox"
1562
- prior_box_param {
1563
- min_size: 213.0
1564
- max_size: 264.0
1565
- aspect_ratio: 2
1566
- flip: true
1567
- clip: false
1568
- variance: 0.1
1569
- variance: 0.1
1570
- variance: 0.2
1571
- variance: 0.2
1572
- step: 100
1573
- offset: 0.5
1574
- }
1575
- }
1576
- layer {
1577
- name: "conv9_2_mbox_loc"
1578
- type: "Convolution"
1579
- bottom: "conv9_2_h"
1580
- top: "conv9_2_mbox_loc"
1581
- param {
1582
- lr_mult: 1
1583
- decay_mult: 1
1584
- }
1585
- param {
1586
- lr_mult: 2
1587
- decay_mult: 0
1588
- }
1589
- convolution_param {
1590
- num_output: 16
1591
- pad: 1
1592
- kernel_size: 3
1593
- stride: 1
1594
- weight_filler {
1595
- type: "xavier"
1596
- }
1597
- bias_filler {
1598
- type: "constant"
1599
- value: 0
1600
- }
1601
- }
1602
- }
1603
- layer {
1604
- name: "conv9_2_mbox_loc_perm"
1605
- type: "Permute"
1606
- bottom: "conv9_2_mbox_loc"
1607
- top: "conv9_2_mbox_loc_perm"
1608
- permute_param {
1609
- order: 0
1610
- order: 2
1611
- order: 3
1612
- order: 1
1613
- }
1614
- }
1615
- layer {
1616
- name: "conv9_2_mbox_loc_flat"
1617
- type: "Flatten"
1618
- bottom: "conv9_2_mbox_loc_perm"
1619
- top: "conv9_2_mbox_loc_flat"
1620
- flatten_param {
1621
- axis: 1
1622
- }
1623
- }
1624
- layer {
1625
- name: "conv9_2_mbox_conf"
1626
- type: "Convolution"
1627
- bottom: "conv9_2_h"
1628
- top: "conv9_2_mbox_conf"
1629
- param {
1630
- lr_mult: 1
1631
- decay_mult: 1
1632
- }
1633
- param {
1634
- lr_mult: 2
1635
- decay_mult: 0
1636
- }
1637
- convolution_param {
1638
- num_output: 8 # 84
1639
- pad: 1
1640
- kernel_size: 3
1641
- stride: 1
1642
- weight_filler {
1643
- type: "xavier"
1644
- }
1645
- bias_filler {
1646
- type: "constant"
1647
- value: 0
1648
- }
1649
- }
1650
- }
1651
- layer {
1652
- name: "conv9_2_mbox_conf_perm"
1653
- type: "Permute"
1654
- bottom: "conv9_2_mbox_conf"
1655
- top: "conv9_2_mbox_conf_perm"
1656
- permute_param {
1657
- order: 0
1658
- order: 2
1659
- order: 3
1660
- order: 1
1661
- }
1662
- }
1663
- layer {
1664
- name: "conv9_2_mbox_conf_flat"
1665
- type: "Flatten"
1666
- bottom: "conv9_2_mbox_conf_perm"
1667
- top: "conv9_2_mbox_conf_flat"
1668
- flatten_param {
1669
- axis: 1
1670
- }
1671
- }
1672
- layer {
1673
- name: "conv9_2_mbox_priorbox"
1674
- type: "PriorBox"
1675
- bottom: "conv9_2_h"
1676
- bottom: "data"
1677
- top: "conv9_2_mbox_priorbox"
1678
- prior_box_param {
1679
- min_size: 264.0
1680
- max_size: 315.0
1681
- aspect_ratio: 2
1682
- flip: true
1683
- clip: false
1684
- variance: 0.1
1685
- variance: 0.1
1686
- variance: 0.2
1687
- variance: 0.2
1688
- step: 300
1689
- offset: 0.5
1690
- }
1691
- }
1692
- layer {
1693
- name: "mbox_loc"
1694
- type: "Concat"
1695
- bottom: "conv4_3_norm_mbox_loc_flat"
1696
- bottom: "fc7_mbox_loc_flat"
1697
- bottom: "conv6_2_mbox_loc_flat"
1698
- bottom: "conv7_2_mbox_loc_flat"
1699
- bottom: "conv8_2_mbox_loc_flat"
1700
- bottom: "conv9_2_mbox_loc_flat"
1701
- top: "mbox_loc"
1702
- concat_param {
1703
- axis: 1
1704
- }
1705
- }
1706
- layer {
1707
- name: "mbox_conf"
1708
- type: "Concat"
1709
- bottom: "conv4_3_norm_mbox_conf_flat"
1710
- bottom: "fc7_mbox_conf_flat"
1711
- bottom: "conv6_2_mbox_conf_flat"
1712
- bottom: "conv7_2_mbox_conf_flat"
1713
- bottom: "conv8_2_mbox_conf_flat"
1714
- bottom: "conv9_2_mbox_conf_flat"
1715
- top: "mbox_conf"
1716
- concat_param {
1717
- axis: 1
1718
- }
1719
- }
1720
- layer {
1721
- name: "mbox_priorbox"
1722
- type: "Concat"
1723
- bottom: "conv4_3_norm_mbox_priorbox"
1724
- bottom: "fc7_mbox_priorbox"
1725
- bottom: "conv6_2_mbox_priorbox"
1726
- bottom: "conv7_2_mbox_priorbox"
1727
- bottom: "conv8_2_mbox_priorbox"
1728
- bottom: "conv9_2_mbox_priorbox"
1729
- top: "mbox_priorbox"
1730
- concat_param {
1731
- axis: 2
1732
- }
1733
- }
1734
-
1735
- layer {
1736
- name: "mbox_conf_reshape"
1737
- type: "Reshape"
1738
- bottom: "mbox_conf"
1739
- top: "mbox_conf_reshape"
1740
- reshape_param {
1741
- shape {
1742
- dim: 0
1743
- dim: -1
1744
- dim: 2
1745
- }
1746
- }
1747
- }
1748
- layer {
1749
- name: "mbox_conf_softmax"
1750
- type: "Softmax"
1751
- bottom: "mbox_conf_reshape"
1752
- top: "mbox_conf_softmax"
1753
- softmax_param {
1754
- axis: 2
1755
- }
1756
- }
1757
- layer {
1758
- name: "mbox_conf_flatten"
1759
- type: "Flatten"
1760
- bottom: "mbox_conf_softmax"
1761
- top: "mbox_conf_flatten"
1762
- flatten_param {
1763
- axis: 1
1764
- }
1765
- }
1766
-
1767
- layer {
1768
- name: "detection_out"
1769
- type: "DetectionOutput"
1770
- bottom: "mbox_loc"
1771
- bottom: "mbox_conf_flatten"
1772
- bottom: "mbox_priorbox"
1773
- top: "detection_out"
1774
- include {
1775
- phase: TEST
1776
- }
1777
- detection_output_param {
1778
- num_classes: 2
1779
- share_location: true
1780
- background_label_id: 0
1781
- nms_param {
1782
- nms_threshold: 0.3
1783
- top_k: 400
1784
- }
1785
- code_type: CENTER_SIZE
1786
- keep_top_k: 200
1787
- confidence_threshold: 0.01
1788
- }
1789
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
download_models.py CHANGED
@@ -1,56 +1,16 @@
 
  import os
  import re
- import cv2
- import numpy as np
- import io
  import sys
  import numpy as np
- import timm
  import pyiqa
  import torch
- from transformers import DonutProcessor, VisionEncoderDecoderModel
-

  device = "cuda" if torch.cuda.is_available() else "cpu"

- licence_model = torch.hub.load(
-     "ultralytics/yolov5", "custom", path="Licenseplate_model.pt", device="cpu", force_reload=True
- )
- licence_model.cpu()
-
- detector = cv2.dnn.DetectionModel("res10_300x300_ssd_iter_140000_fp16.caffemodel", "deploy.prototxt")
-
- processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
- doc_qa_model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
- doc_qa_model.to(device)
-
- model = torch.hub.load(
-     "ultralytics/yolov5", "custom", path="best.pt", device="cpu", force_reload=True
- )
- model.cpu()
-
- classes = [
-     "gas-distribution-meter",
-     "gas-distribution-piping",
-     "gas-distribution-regulator",
-     "gas-distribution-valve"
- ]
-
- class_to_idx = {'gas-distribution-meter': 0,
-     'gas-distribution-piping': 1,
-     'gas-distribution-regulator': 2,
-     'gas-distribution-valve': 3}
-
- idx_to_classes = {v:k for k,v in class_to_idx.items()}
- modelname = "resnet50d"
- model_weights = "best_classifer_model.pt"
- num_classes = len(classes)
-
- classifier_model = timm.create_model(
-     "resnet50d", pretrained=True, num_classes=num_classes, drop_path_rate=0.05
- )
- classifier_model.load_state_dict(torch.load(model_weights, map_location=torch.device('cpu'))["model_state_dict"])

- musiq_metric = pyiqa.create_metric('musiq-koniq', device=torch.device('cpu'))

+ import io
  import os
  import re
  import sys
+
+ import cv2
  import numpy as np
  import pyiqa
+ import timm
  import torch
+ from sentence_transformers import SentenceTransformer

  device = "cuda" if torch.cuda.is_available() else "cpu"

+ image_sim_model = SentenceTransformer("clip-ViT-B-32")

res10_300x300_ssd_iter_140000_fp16.caffemodel DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:510ffd2471bd81e3fcc88a5beb4eae4fb445ccf8333ebc54e7302b83f4158a76
- size 5351047