abhi001vj committed on
Commit
637af2f
•
1 Parent(s): 31e192b

added the fixes for local

.env CHANGED
@@ -1,2 +1,2 @@
- PINECONE_KEY=696a2b15-b4c0-4581-af5d-2d52d0198950
- PINECONE_ENV=us-central1-gcp
+ PINECONE_KEY=
+ PINECONE_ENV=

Licenseplate_model.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3c9a080781aa7ff722968c944a702983af8a452753edd5ba20719d42349ec7bd
- size 71780037

app.py CHANGED
@@ -1,5 +1,3 @@
- import cv2
- import numpy as np
  import argparse
  import base64
  import io
@@ -9,8 +7,7 @@ import sys
  import traceback
  import uuid
  from typing import List, Optional
- from PIL import ImageEnhance
- import traceback
  import cv2
  import numpy as np
  import pandas as pd
@@ -21,31 +18,20 @@ import torch
  import uvicorn
  from dotenv import load_dotenv
  from fastapi import FastAPI, File, Form, HTTPException, UploadFile
- from PIL import Image
  from pydantic import BaseModel
  from sentence_transformers import SentenceTransformer, util
- from transformers import (
-     AutoFeatureExtractor,
-     AutoModel,
-     DonutProcessor,
-     VisionEncoderDecoderModel,
- )

  load_dotenv()
  pinecone.init(api_key=os.getenv("PINECONE_KEY"), environment=os.getenv("PINECONE_ENV"))
- DETECTION_URL = "/object-detection/"
- CLASSIFICATION_URL = "/object-classification/"
- QUALITY_ASSESSMENT_URL = "/quality-assessment/"
- FACE_URL = "/face-anonymization/"
- LICENCE_URL = "/licenceplate-anonymization/"
- DOCUMENT_QA = "/document-qa/"
  IMAGE_SIMILARITY_DEMO = "/find-similar-image/"
  IMAGE_SIMILARITY_PINECONE_DEMO = "/find-similar-image-pinecone/"
  INDEX_NAME = "imagesearch-demo"
  INDEX_DIMENSION = 512
  TMP_DIR = "tmp"

-

  def enhance_image(pil_image):
      # Convert PIL Image to OpenCV format
@@ -99,353 +85,22 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
99
 
100
  os.makedirs(TMP_DIR, exist_ok=True)
101
 
102
- licence_model = torch.hub.load(
103
- "ultralytics/yolov5", "custom", path="Licenseplate_model.pt", device="cpu", force_reload=True
104
- )
105
- licence_model.cpu()
106
-
107
- detector = cv2.dnn.DetectionModel(
108
- "res10_300x300_ssd_iter_140000_fp16.caffemodel", "deploy.prototxt"
109
- )
110
-
111
- processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
112
- doc_qa_model = VisionEncoderDecoderModel.from_pretrained(
113
- "naver-clova-ix/donut-base-finetuned-docvqa"
114
- )
115
 
116
  device = "cuda" if torch.cuda.is_available() else "cpu"
117
- doc_qa_model.to(device)
118
 
119
 
120
  os.makedirs(TMP_DIR, exist_ok=True)
121
 
122
- model = torch.hub.load(
123
- "ultralytics/yolov5", "custom", path="best.pt", device="cpu", force_reload=True
124
- )
125
- model.cpu()
126
-
127
- classes = [
128
- "gas-distribution-meter",
129
- "gas-distribution-piping",
130
- "gas-distribution-regulator",
131
- "gas-distribution-valve",
132
- ]
133
-
134
- class_to_idx = {
135
- "gas-distribution-meter": 0,
136
- "gas-distribution-piping": 1,
137
- "gas-distribution-regulator": 2,
138
- "gas-distribution-valve": 3,
139
- }
140
-
141
- idx_to_classes = {v: k for k, v in class_to_idx.items()}
142
- modelname = "resnet50d"
143
- model_weights = "best_classifer_model.pt"
144
- num_classes = len(classes)
145
-
146
- classifier_model = timm.create_model(
147
- "resnet50d", pretrained=True, num_classes=num_classes, drop_path_rate=0.05
148
- )
149
- classifier_model.load_state_dict(
150
- torch.load(model_weights, map_location=torch.device("cpu"))["model_state_dict"]
151
- )
152
-
153
- musiq_metric = pyiqa.create_metric("musiq-koniq", device=torch.device("cpu"))
154
- image_sim_model = SentenceTransformer("clip-ViT-B-32")
155
-
156
 
157
- # model_ckpt = "nateraw/vit-base-beans"
158
- # extractor = AutoFeatureExtractor.from_pretrained(model_ckpt)
159
- # image_sim_model = AutoModel.from_pretrained(model_ckpt)
160
 
161
 
162
  app = FastAPI(title="CV Demos")
163
 
164
- # Define the Response
165
- class Prediction(BaseModel):
166
- filename: str
167
- contenttype: str
168
- prediction: List[float] = []
169
-
170
-
171
- # define response
172
- @app.get("/")
173
- def root_route():
174
- return {"error": f"Use GET {DETECTION_URL} instead of the root route!"}
175
-
176
-
177
- @app.post(
178
- DETECTION_URL,
179
- )
180
- async def predict(file: UploadFile = File(...), quality_check: bool = False):
181
- try:
182
- extension = file.filename.split(".")[-1] in ("jpg", "jpeg", "png")
183
- if not extension:
184
- return "Image must be jpg or png format!"
185
- # read image contain
186
- contents = await file.read()
187
- pil_image = Image.open(io.BytesIO(contents))
188
- if quality_check:
189
- print("RUNNING QUALITY CEHCK BEFORE OBJEFCT DETECTION!!!")
190
- tmp_file = f"{TMP_DIR}/tmp.png"
191
- pil_image.save(tmp_file)
192
- score = musiq_metric(tmp_file)
193
- if score < 50:
194
- return {
195
- "Error": "Image quality is not sufficient enough to be considered for object detection"
196
- }
197
-
198
- results = model(pil_image, size=640) # reduce size=320 for faster inference
199
- return results.pandas().xyxy[0].to_json(orient="records")
200
- except:
201
- e = sys.exc_info()[1]
202
- raise HTTPException(status_code=500, detail=str(e))
203
-
204
-
205
- @app.post(CLASSIFICATION_URL)
206
- async def classify(file: UploadFile = File(...)):
207
- try:
208
- extension = file.filename.split(".")[-1] in ("jpg", "jpeg", "png")
209
- if not extension:
210
- return "Image must be jpg or png format!"
211
- # read image contain
212
- contents = await file.read()
213
- pil_image = Image.open(io.BytesIO(contents))
214
- data_mean = (0.485, 0.456, 0.406)
215
- data_std = (0.229, 0.224, 0.225)
216
- image_size = (224, 224)
217
- eval_transforms = timm.data.create_transform(
218
- input_size=image_size, mean=data_mean, std=data_std
219
- )
220
- eval_transforms(pil_image).unsqueeze(dim=0).shape
221
- classifier_model.eval()
222
- print("RUNNING Image Classification!!!")
223
- max_class_idx = np.argmax(
224
- classifier_model(eval_transforms(pil_image).unsqueeze(dim=0)).detach().numpy()
225
- )
226
- predicted_class = idx_to_classes[max_class_idx]
227
- print(f"Predicted Class idx: {max_class_idx} with name : {predicted_class}")
228
- return {"object": predicted_class}
229
-
230
- except:
231
- e = sys.exc_info()[1]
232
- raise HTTPException(status_code=500, detail=str(e))
233
-
234
-
235
- @app.post(QUALITY_ASSESSMENT_URL)
236
- async def quality_check(file: UploadFile = File(...)):
237
- try:
238
- extension = file.filename.split(".")[-1] in ("jpg", "jpeg", "png")
239
- if not extension:
240
- return "Image must be jpg or png format!"
241
- # read image contain
242
- contents = await file.read()
243
- pil_image = Image.open(io.BytesIO(contents))
244
- tmp_file = f"{TMP_DIR}/tmp.png"
245
- pil_image.save(tmp_file)
246
- score = musiq_metric(tmp_file).detach().numpy().tolist()
247
- return {"score": score}
248
-
249
- except:
250
- e = sys.exc_info()[1]
251
- raise HTTPException(status_code=500, detail=str(e))
252
-
253
-
254
- def anonymize_simple(image, factor=3.0):
255
- # automatically determine the size of the blurring kernel based
256
- # on the spatial dimensions of the input image
257
- (h, w) = image.shape[:2]
258
- kW = int(w / factor)
259
- kH = int(h / factor)
260
- # ensure the width of the kernel is odd
261
- if kW % 2 == 0:
262
- kW -= 1
263
- # ensure the height of the kernel is odd
264
- if kH % 2 == 0:
265
- kH -= 1
266
- # apply a Gaussian blur to the input image using our computed
267
- # kernel size
268
- return cv2.GaussianBlur(image, (kW, kH), 0)
269
-
270
-
271
- def anonymize_pixelate(image, blocks=3):
272
- # divide the input image into NxN blocks
273
- (h, w) = image.shape[:2]
274
- xSteps = np.linspace(0, w, blocks + 1, dtype="int")
275
- ySteps = np.linspace(0, h, blocks + 1, dtype="int")
276
- # loop over the blocks in both the x and y direction
277
- for i in range(1, len(ySteps)):
278
- for j in range(1, len(xSteps)):
279
- # compute the starting and ending (x, y)-coordinates
280
- # for the current block
281
- startX = xSteps[j - 1]
282
- startY = ySteps[i - 1]
283
- endX = xSteps[j]
284
- endY = ySteps[i]
285
- # extract the ROI using NumPy array slicing, compute the
286
- # mean of the ROI, and then draw a rectangle with the
287
- # mean RGB values over the ROI in the original image
288
- roi = image[startY:endY, startX:endX]
289
- (B, G, R) = [int(x) for x in cv2.mean(roi)[:3]]
290
- cv2.rectangle(image, (startX, startY), (endX, endY), (B, G, R), -1)
291
- # return the pixelated blurred image
292
- return image
293
-
294
 
295
  # define response
296
  @app.get("/")
297
  def root_route():
298
- return {"error": f"Use GET {FACE_URL} or {LICENCE_URL} instead of the root route!"}
299
-
300
-
301
- @app.post(
302
- FACE_URL,
303
- )
304
- async def face_anonymize(
305
- file: UploadFile = File(...), blur_type="simple", quality_check: bool = False
306
- ):
307
- """
308
- https://pyimagesearch.com/2020/04/06/blur-and-anonymize-faces-with-opencv-and-python/
309
- """
310
- try:
311
- extension = file.filename.split(".")[-1] in ("jpg", "jpeg", "png")
312
- if not extension:
313
- return "Image must be jpg or png format!"
314
- # read image contain
315
- contents = await file.read()
316
- pil_image = Image.open(io.BytesIO(contents)).convert("RGB")
317
- detector = cv2.dnn.DetectionModel(
318
- "res10_300x300_ssd_iter_140000_fp16.caffemodel", "deploy.prototxt"
319
- )
320
- open_cv_image = np.array(pil_image)
321
- # Convert RGB to BGR
322
- open_cv_image = open_cv_image[:, :, ::-1].copy()
323
- (h, w) = open_cv_image.shape[:2]
324
- # Getting the detections
325
- detections = detector.detect(open_cv_image)
326
- if len(detections[2]) > 0:
327
- for face in detections[2]:
328
- (x, y, w, h) = face.astype("int")
329
- # extract the face ROI
330
-
331
- face = open_cv_image[y : y + h, x : x + w]
332
- if blur_type == "simple":
333
- face = anonymize_simple(face)
334
- else:
335
- face = anonymize_pixelate(face)
336
- open_cv_image[y : y + h, x : x + w] = face
337
-
338
- _, encoded_img = cv2.imencode(".PNG", open_cv_image)
339
-
340
- encoded_img = base64.b64encode(encoded_img)
341
- return {
342
- "filename": file.filename,
343
- "dimensions": str(open_cv_image.shape),
344
- "encoded_img": encoded_img,
345
- }
346
- except:
347
- e = sys.exc_info()[1]
348
- print(traceback.format_exc())
349
- raise HTTPException(status_code=500, detail=str(e))
350
-
351
-
352
- @app.post(LICENCE_URL)
353
- async def licence_anonymize(file: UploadFile = File(...), blur_type="simple"):
354
- """https://www.kaggle.com/code/gowrishankarp/license-plate-detection-yolov5-pytesseract/notebook#Visualize"""
355
- try:
356
- extension = file.filename.split(".")[-1] in ("jpg", "jpeg", "png")
357
- if not extension:
358
- return "Image must be jpg or png format!"
359
- # read image contain
360
- contents = await file.read()
361
- pil_image = Image.open(io.BytesIO(contents))
362
- results = licence_model(pil_image, size=640) # reduce size=320 for faster inference
363
- pil_image = pil_image.convert("RGB")
364
- open_cv_image = np.array(pil_image)
365
- open_cv_image = open_cv_image[:, :, ::-1].copy()
366
- df = results.pandas().xyxy[0]
367
- for i, row in df.iterrows():
368
- xmin = int(row["xmin"])
369
- ymin = int(row["ymin"])
370
- xmax = int(row["xmax"])
371
- ymax = int(row["ymax"])
372
- licence = open_cv_image[ymin:ymax, xmin:xmax]
373
- if blur_type == "simple":
374
- licence = anonymize_simple(licence)
375
- else:
376
- licence = anonymize_pixelate(licence)
377
- open_cv_image[ymin:ymax, xmin:xmax] = licence
378
-
379
- _, encoded_img = cv2.imencode(".PNG", open_cv_image)
380
-
381
- encoded_img = base64.b64encode(encoded_img)
382
- return {
383
- "filename": file.filename,
384
- "dimensions": str(open_cv_image.shape),
385
- "encoded_img": encoded_img,
386
- }
387
-
388
- except:
389
- e = sys.exc_info()[1]
390
- raise HTTPException(status_code=500, detail=str(e))
391
-
392
-
393
- def process_document(image, question):
394
- # prepare encoder inputs
395
- pixel_values = processor(image, return_tensors="pt").pixel_values
396
-
397
- # prepare decoder inputs
398
- task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
399
- prompt = task_prompt.replace("{user_input}", question)
400
- decoder_input_ids = processor.tokenizer(
401
- prompt, add_special_tokens=False, return_tensors="pt"
402
- ).input_ids
403
-
404
- # generate answer
405
- outputs = doc_qa_model.generate(
406
- pixel_values.to(device),
407
- decoder_input_ids=decoder_input_ids.to(device),
408
- max_length=doc_qa_model.decoder.config.max_position_embeddings,
409
- early_stopping=True,
410
- pad_token_id=processor.tokenizer.pad_token_id,
411
- eos_token_id=processor.tokenizer.eos_token_id,
412
- use_cache=True,
413
- num_beams=1,
414
- bad_words_ids=[[processor.tokenizer.unk_token_id]],
415
- return_dict_in_generate=True,
416
- )
417
-
418
- # postprocess
419
- sequence = processor.batch_decode(outputs.sequences)[0]
420
- sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(
421
- processor.tokenizer.pad_token, ""
422
- )
423
- sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token
424
-
425
- return processor.token2json(sequence)
426
-
427
-
428
- @app.post(DOCUMENT_QA)
429
- async def document_qa(question: str = Form(...), file: UploadFile = File(...)):
430
-
431
- try:
432
- extension = file.filename.split(".")[-1] in ("jpg", "jpeg", "png")
433
- if not extension:
434
- return "Image must be jpg or png format!"
435
- # read image contain
436
- contents = await file.read()
437
- pil_image = Image.open(io.BytesIO(contents))
438
- # tmp_file = f"{TMP_DIR}/tmp.png"
439
- # pil_image.save(tmp_file)
440
- # answer_git_large = generate_answer_git(git_processor_large, git_model_large, image, question)
441
-
442
- answer = process_document(pil_image, question)["answer"]
443
-
444
- return {"answer": answer}
445
-
446
- except:
447
- e = sys.exc_info()[1]
448
- raise HTTPException(status_code=500, detail=str(e))
449
 
450
 
451
  @app.post(IMAGE_SIMILARITY_DEMO)
 
 
 
  import argparse
  import base64
  import io

  import traceback
  import uuid
  from typing import List, Optional
+
  import cv2
  import numpy as np
  import pandas as pd

  import uvicorn
  from dotenv import load_dotenv
  from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+ from PIL import Image, ImageEnhance
  from pydantic import BaseModel
  from sentence_transformers import SentenceTransformer, util

  load_dotenv()
  pinecone.init(api_key=os.getenv("PINECONE_KEY"), environment=os.getenv("PINECONE_ENV"))
+
  IMAGE_SIMILARITY_DEMO = "/find-similar-image/"
  IMAGE_SIMILARITY_PINECONE_DEMO = "/find-similar-image-pinecone/"
  INDEX_NAME = "imagesearch-demo"
  INDEX_DIMENSION = 512
  TMP_DIR = "tmp"

+ image_sim_model = SentenceTransformer("clip-ViT-B-32")

  def enhance_image(pil_image):
      # Convert PIL Image to OpenCV format

  os.makedirs(TMP_DIR, exist_ok=True)

  device = "cuda" if torch.cuda.is_available() else "cpu"

  os.makedirs(TMP_DIR, exist_ok=True)

  app = FastAPI(title="CV Demos")

  # define response
  @app.get("/")
  def root_route():
+     return {"error": f"Use GET {IMAGE_SIMILARITY_PINECONE_DEMO} instead of the root route!"}

  @app.post(IMAGE_SIMILARITY_DEMO)
best.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c8faa2592e29248e58453cb031e536bd96f2929d9768bbd3c78ea54944f045db
- size 14447677

best_classifer_model.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4e5c0f63fbe8f8349ceda742cc6c7d333c1a2ae443b6f7aa1d100859d59322a7
- size 377080432

deploy.prototxt DELETED
@@ -1,1789 +0,0 @@
1
- input: "data"
2
- input_shape {
3
- dim: 1
4
- dim: 3
5
- dim: 300
6
- dim: 300
7
- }
8
-
9
- layer {
10
- name: "data_bn"
11
- type: "BatchNorm"
12
- bottom: "data"
13
- top: "data_bn"
14
- param {
15
- lr_mult: 0.0
16
- }
17
- param {
18
- lr_mult: 0.0
19
- }
20
- param {
21
- lr_mult: 0.0
22
- }
23
- }
24
- layer {
25
- name: "data_scale"
26
- type: "Scale"
27
- bottom: "data_bn"
28
- top: "data_bn"
29
- param {
30
- lr_mult: 1.0
31
- decay_mult: 1.0
32
- }
33
- param {
34
- lr_mult: 2.0
35
- decay_mult: 1.0
36
- }
37
- scale_param {
38
- bias_term: true
39
- }
40
- }
41
- layer {
42
- name: "conv1_h"
43
- type: "Convolution"
44
- bottom: "data_bn"
45
- top: "conv1_h"
46
- param {
47
- lr_mult: 1.0
48
- decay_mult: 1.0
49
- }
50
- param {
51
- lr_mult: 2.0
52
- decay_mult: 1.0
53
- }
54
- convolution_param {
55
- num_output: 32
56
- pad: 3
57
- kernel_size: 7
58
- stride: 2
59
- weight_filler {
60
- type: "msra"
61
- variance_norm: FAN_OUT
62
- }
63
- bias_filler {
64
- type: "constant"
65
- value: 0.0
66
- }
67
- }
68
- }
69
- layer {
70
- name: "conv1_bn_h"
71
- type: "BatchNorm"
72
- bottom: "conv1_h"
73
- top: "conv1_h"
74
- param {
75
- lr_mult: 0.0
76
- }
77
- param {
78
- lr_mult: 0.0
79
- }
80
- param {
81
- lr_mult: 0.0
82
- }
83
- }
84
- layer {
85
- name: "conv1_scale_h"
86
- type: "Scale"
87
- bottom: "conv1_h"
88
- top: "conv1_h"
89
- param {
90
- lr_mult: 1.0
91
- decay_mult: 1.0
92
- }
93
- param {
94
- lr_mult: 2.0
95
- decay_mult: 1.0
96
- }
97
- scale_param {
98
- bias_term: true
99
- }
100
- }
101
- layer {
102
- name: "conv1_relu"
103
- type: "ReLU"
104
- bottom: "conv1_h"
105
- top: "conv1_h"
106
- }
107
- layer {
108
- name: "conv1_pool"
109
- type: "Pooling"
110
- bottom: "conv1_h"
111
- top: "conv1_pool"
112
- pooling_param {
113
- kernel_size: 3
114
- stride: 2
115
- }
116
- }
117
- layer {
118
- name: "layer_64_1_conv1_h"
119
- type: "Convolution"
120
- bottom: "conv1_pool"
121
- top: "layer_64_1_conv1_h"
122
- param {
123
- lr_mult: 1.0
124
- decay_mult: 1.0
125
- }
126
- convolution_param {
127
- num_output: 32
128
- bias_term: false
129
- pad: 1
130
- kernel_size: 3
131
- stride: 1
132
- weight_filler {
133
- type: "msra"
134
- }
135
- bias_filler {
136
- type: "constant"
137
- value: 0.0
138
- }
139
- }
140
- }
141
- layer {
142
- name: "layer_64_1_bn2_h"
143
- type: "BatchNorm"
144
- bottom: "layer_64_1_conv1_h"
145
- top: "layer_64_1_conv1_h"
146
- param {
147
- lr_mult: 0.0
148
- }
149
- param {
150
- lr_mult: 0.0
151
- }
152
- param {
153
- lr_mult: 0.0
154
- }
155
- }
156
- layer {
157
- name: "layer_64_1_scale2_h"
158
- type: "Scale"
159
- bottom: "layer_64_1_conv1_h"
160
- top: "layer_64_1_conv1_h"
161
- param {
162
- lr_mult: 1.0
163
- decay_mult: 1.0
164
- }
165
- param {
166
- lr_mult: 2.0
167
- decay_mult: 1.0
168
- }
169
- scale_param {
170
- bias_term: true
171
- }
172
- }
173
- layer {
174
- name: "layer_64_1_relu2"
175
- type: "ReLU"
176
- bottom: "layer_64_1_conv1_h"
177
- top: "layer_64_1_conv1_h"
178
- }
179
- layer {
180
- name: "layer_64_1_conv2_h"
181
- type: "Convolution"
182
- bottom: "layer_64_1_conv1_h"
183
- top: "layer_64_1_conv2_h"
184
- param {
185
- lr_mult: 1.0
186
- decay_mult: 1.0
187
- }
188
- convolution_param {
189
- num_output: 32
190
- bias_term: false
191
- pad: 1
192
- kernel_size: 3
193
- stride: 1
194
- weight_filler {
195
- type: "msra"
196
- }
197
- bias_filler {
198
- type: "constant"
199
- value: 0.0
200
- }
201
- }
202
- }
203
- layer {
204
- name: "layer_64_1_sum"
205
- type: "Eltwise"
206
- bottom: "layer_64_1_conv2_h"
207
- bottom: "conv1_pool"
208
- top: "layer_64_1_sum"
209
- }
210
- layer {
211
- name: "layer_128_1_bn1_h"
212
- type: "BatchNorm"
213
- bottom: "layer_64_1_sum"
214
- top: "layer_128_1_bn1_h"
215
- param {
216
- lr_mult: 0.0
217
- }
218
- param {
219
- lr_mult: 0.0
220
- }
221
- param {
222
- lr_mult: 0.0
223
- }
224
- }
225
- layer {
226
- name: "layer_128_1_scale1_h"
227
- type: "Scale"
228
- bottom: "layer_128_1_bn1_h"
229
- top: "layer_128_1_bn1_h"
230
- param {
231
- lr_mult: 1.0
232
- decay_mult: 1.0
233
- }
234
- param {
235
- lr_mult: 2.0
236
- decay_mult: 1.0
237
- }
238
- scale_param {
239
- bias_term: true
240
- }
241
- }
242
- layer {
243
- name: "layer_128_1_relu1"
244
- type: "ReLU"
245
- bottom: "layer_128_1_bn1_h"
246
- top: "layer_128_1_bn1_h"
247
- }
248
- layer {
249
- name: "layer_128_1_conv1_h"
250
- type: "Convolution"
251
- bottom: "layer_128_1_bn1_h"
252
- top: "layer_128_1_conv1_h"
253
- param {
254
- lr_mult: 1.0
255
- decay_mult: 1.0
256
- }
257
- convolution_param {
258
- num_output: 128
259
- bias_term: false
260
- pad: 1
261
- kernel_size: 3
262
- stride: 2
263
- weight_filler {
264
- type: "msra"
265
- }
266
- bias_filler {
267
- type: "constant"
268
- value: 0.0
269
- }
270
- }
271
- }
272
- layer {
273
- name: "layer_128_1_bn2"
274
- type: "BatchNorm"
275
- bottom: "layer_128_1_conv1_h"
276
- top: "layer_128_1_conv1_h"
277
- param {
278
- lr_mult: 0.0
279
- }
280
- param {
281
- lr_mult: 0.0
282
- }
283
- param {
284
- lr_mult: 0.0
285
- }
286
- }
287
- layer {
288
- name: "layer_128_1_scale2"
289
- type: "Scale"
290
- bottom: "layer_128_1_conv1_h"
291
- top: "layer_128_1_conv1_h"
292
- param {
293
- lr_mult: 1.0
294
- decay_mult: 1.0
295
- }
296
- param {
297
- lr_mult: 2.0
298
- decay_mult: 1.0
299
- }
300
- scale_param {
301
- bias_term: true
302
- }
303
- }
304
- layer {
305
- name: "layer_128_1_relu2"
306
- type: "ReLU"
307
- bottom: "layer_128_1_conv1_h"
308
- top: "layer_128_1_conv1_h"
309
- }
310
- layer {
311
- name: "layer_128_1_conv2"
312
- type: "Convolution"
313
- bottom: "layer_128_1_conv1_h"
314
- top: "layer_128_1_conv2"
315
- param {
316
- lr_mult: 1.0
317
- decay_mult: 1.0
318
- }
319
- convolution_param {
320
- num_output: 128
321
- bias_term: false
322
- pad: 1
323
- kernel_size: 3
324
- stride: 1
325
- weight_filler {
326
- type: "msra"
327
- }
328
- bias_filler {
329
- type: "constant"
330
- value: 0.0
331
- }
332
- }
333
- }
334
- layer {
335
- name: "layer_128_1_conv_expand_h"
336
- type: "Convolution"
337
- bottom: "layer_128_1_bn1_h"
338
- top: "layer_128_1_conv_expand_h"
339
- param {
340
- lr_mult: 1.0
341
- decay_mult: 1.0
342
- }
343
- convolution_param {
344
- num_output: 128
345
- bias_term: false
346
- pad: 0
347
- kernel_size: 1
348
- stride: 2
349
- weight_filler {
350
- type: "msra"
351
- }
352
- bias_filler {
353
- type: "constant"
354
- value: 0.0
355
- }
356
- }
357
- }
358
- layer {
359
- name: "layer_128_1_sum"
360
- type: "Eltwise"
361
- bottom: "layer_128_1_conv2"
362
- bottom: "layer_128_1_conv_expand_h"
363
- top: "layer_128_1_sum"
364
- }
365
- layer {
366
- name: "layer_256_1_bn1"
367
- type: "BatchNorm"
368
- bottom: "layer_128_1_sum"
369
- top: "layer_256_1_bn1"
370
- param {
371
- lr_mult: 0.0
372
- }
373
- param {
374
- lr_mult: 0.0
375
- }
376
- param {
377
- lr_mult: 0.0
378
- }
379
- }
380
- layer {
381
- name: "layer_256_1_scale1"
382
- type: "Scale"
383
- bottom: "layer_256_1_bn1"
384
- top: "layer_256_1_bn1"
385
- param {
386
- lr_mult: 1.0
387
- decay_mult: 1.0
388
- }
389
- param {
390
- lr_mult: 2.0
391
- decay_mult: 1.0
392
- }
393
- scale_param {
394
- bias_term: true
395
- }
396
- }
397
- layer {
398
- name: "layer_256_1_relu1"
399
- type: "ReLU"
400
- bottom: "layer_256_1_bn1"
401
- top: "layer_256_1_bn1"
402
- }
403
- layer {
404
- name: "layer_256_1_conv1"
405
- type: "Convolution"
406
- bottom: "layer_256_1_bn1"
407
- top: "layer_256_1_conv1"
408
- param {
409
- lr_mult: 1.0
410
- decay_mult: 1.0
411
- }
412
- convolution_param {
413
- num_output: 256
414
- bias_term: false
415
- pad: 1
416
- kernel_size: 3
417
- stride: 2
418
- weight_filler {
419
- type: "msra"
420
- }
421
- bias_filler {
422
- type: "constant"
423
- value: 0.0
424
- }
425
- }
426
- }
427
- layer {
428
- name: "layer_256_1_bn2"
429
- type: "BatchNorm"
430
- bottom: "layer_256_1_conv1"
431
- top: "layer_256_1_conv1"
432
- param {
433
- lr_mult: 0.0
434
- }
435
- param {
436
- lr_mult: 0.0
437
- }
438
- param {
439
- lr_mult: 0.0
440
- }
441
- }
442
- layer {
443
- name: "layer_256_1_scale2"
444
- type: "Scale"
445
- bottom: "layer_256_1_conv1"
446
- top: "layer_256_1_conv1"
447
- param {
448
- lr_mult: 1.0
449
- decay_mult: 1.0
450
- }
451
- param {
452
- lr_mult: 2.0
453
- decay_mult: 1.0
454
- }
455
- scale_param {
456
- bias_term: true
457
- }
458
- }
459
- layer {
460
- name: "layer_256_1_relu2"
461
- type: "ReLU"
462
- bottom: "layer_256_1_conv1"
463
- top: "layer_256_1_conv1"
464
- }
465
- layer {
466
- name: "layer_256_1_conv2"
467
- type: "Convolution"
468
- bottom: "layer_256_1_conv1"
469
- top: "layer_256_1_conv2"
470
- param {
471
- lr_mult: 1.0
472
- decay_mult: 1.0
473
- }
474
- convolution_param {
475
- num_output: 256
476
- bias_term: false
477
- pad: 1
478
- kernel_size: 3
479
- stride: 1
480
- weight_filler {
481
- type: "msra"
482
- }
483
- bias_filler {
484
- type: "constant"
485
- value: 0.0
486
- }
487
- }
488
- }
489
- layer {
490
- name: "layer_256_1_conv_expand"
491
- type: "Convolution"
492
- bottom: "layer_256_1_bn1"
493
- top: "layer_256_1_conv_expand"
494
- param {
495
- lr_mult: 1.0
496
- decay_mult: 1.0
497
- }
498
- convolution_param {
499
- num_output: 256
500
- bias_term: false
501
- pad: 0
502
- kernel_size: 1
503
- stride: 2
504
- weight_filler {
505
- type: "msra"
506
- }
507
- bias_filler {
508
- type: "constant"
509
- value: 0.0
510
- }
511
- }
512
- }
513
- layer {
514
- name: "layer_256_1_sum"
515
- type: "Eltwise"
516
- bottom: "layer_256_1_conv2"
517
- bottom: "layer_256_1_conv_expand"
518
- top: "layer_256_1_sum"
519
- }
520
- layer {
521
- name: "layer_512_1_bn1"
522
- type: "BatchNorm"
523
- bottom: "layer_256_1_sum"
524
- top: "layer_512_1_bn1"
525
- param {
526
- lr_mult: 0.0
527
- }
528
- param {
529
- lr_mult: 0.0
530
- }
531
- param {
532
- lr_mult: 0.0
533
- }
534
- }
535
- layer {
536
- name: "layer_512_1_scale1"
537
- type: "Scale"
538
- bottom: "layer_512_1_bn1"
539
- top: "layer_512_1_bn1"
540
- param {
541
- lr_mult: 1.0
542
- decay_mult: 1.0
543
- }
544
- param {
545
- lr_mult: 2.0
546
- decay_mult: 1.0
547
- }
548
- scale_param {
549
- bias_term: true
550
- }
551
- }
552
- layer {
553
- name: "layer_512_1_relu1"
554
- type: "ReLU"
555
- bottom: "layer_512_1_bn1"
556
- top: "layer_512_1_bn1"
557
- }
558
- layer {
559
- name: "layer_512_1_conv1_h"
560
- type: "Convolution"
561
- bottom: "layer_512_1_bn1"
562
- top: "layer_512_1_conv1_h"
563
- param {
564
- lr_mult: 1.0
565
- decay_mult: 1.0
566
- }
567
- convolution_param {
568
- num_output: 128
569
- bias_term: false
570
- pad: 1
571
- kernel_size: 3
572
- stride: 1 # 2
573
- weight_filler {
574
- type: "msra"
575
- }
576
- bias_filler {
577
- type: "constant"
578
- value: 0.0
579
- }
580
- }
581
- }
582
- layer {
583
- name: "layer_512_1_bn2_h"
584
- type: "BatchNorm"
585
- bottom: "layer_512_1_conv1_h"
586
- top: "layer_512_1_conv1_h"
587
- param {
588
- lr_mult: 0.0
589
- }
590
- param {
591
- lr_mult: 0.0
592
- }
593
- param {
594
- lr_mult: 0.0
595
- }
596
- }
597
- layer {
598
- name: "layer_512_1_scale2_h"
599
- type: "Scale"
600
- bottom: "layer_512_1_conv1_h"
601
- top: "layer_512_1_conv1_h"
602
- param {
603
- lr_mult: 1.0
604
- decay_mult: 1.0
605
- }
606
- param {
607
- lr_mult: 2.0
608
- decay_mult: 1.0
609
- }
610
- scale_param {
611
- bias_term: true
612
- }
613
- }
614
- layer {
615
- name: "layer_512_1_relu2"
616
- type: "ReLU"
617
- bottom: "layer_512_1_conv1_h"
618
- top: "layer_512_1_conv1_h"
619
- }
620
- layer {
621
- name: "layer_512_1_conv2_h"
622
- type: "Convolution"
623
- bottom: "layer_512_1_conv1_h"
624
- top: "layer_512_1_conv2_h"
625
- param {
626
- lr_mult: 1.0
627
- decay_mult: 1.0
628
- }
629
- convolution_param {
630
- num_output: 256
631
- bias_term: false
632
- pad: 2 # 1
633
- kernel_size: 3
634
- stride: 1
635
- dilation: 2
636
- weight_filler {
637
- type: "msra"
638
- }
639
- bias_filler {
640
- type: "constant"
641
- value: 0.0
642
- }
643
- }
644
- }
645
- layer {
646
- name: "layer_512_1_conv_expand_h"
647
- type: "Convolution"
648
- bottom: "layer_512_1_bn1"
649
- top: "layer_512_1_conv_expand_h"
650
- param {
651
- lr_mult: 1.0
652
- decay_mult: 1.0
653
- }
654
- convolution_param {
655
- num_output: 256
656
- bias_term: false
657
- pad: 0
658
- kernel_size: 1
659
- stride: 1 # 2
660
- weight_filler {
661
- type: "msra"
662
- }
663
- bias_filler {
664
- type: "constant"
665
- value: 0.0
666
- }
667
- }
668
- }
669
- layer {
670
- name: "layer_512_1_sum"
671
- type: "Eltwise"
672
- bottom: "layer_512_1_conv2_h"
673
- bottom: "layer_512_1_conv_expand_h"
674
- top: "layer_512_1_sum"
675
- }
676
- layer {
677
- name: "last_bn_h"
678
- type: "BatchNorm"
679
- bottom: "layer_512_1_sum"
680
- top: "layer_512_1_sum"
681
- param {
682
- lr_mult: 0.0
683
- }
684
- param {
685
- lr_mult: 0.0
686
- }
687
- param {
688
- lr_mult: 0.0
689
- }
690
- }
691
- layer {
692
- name: "last_scale_h"
693
- type: "Scale"
694
- bottom: "layer_512_1_sum"
695
- top: "layer_512_1_sum"
696
- param {
697
- lr_mult: 1.0
698
- decay_mult: 1.0
699
- }
700
- param {
701
- lr_mult: 2.0
702
- decay_mult: 1.0
703
- }
704
- scale_param {
705
- bias_term: true
706
- }
707
- }
708
- layer {
709
- name: "last_relu"
710
- type: "ReLU"
711
- bottom: "layer_512_1_sum"
712
- top: "fc7"
713
- }
714
-
715
- layer {
716
- name: "conv6_1_h"
717
- type: "Convolution"
718
- bottom: "fc7"
719
- top: "conv6_1_h"
720
- param {
721
- lr_mult: 1
722
- decay_mult: 1
723
- }
724
- param {
725
- lr_mult: 2
726
- decay_mult: 0
727
- }
728
- convolution_param {
729
- num_output: 128
730
- pad: 0
731
- kernel_size: 1
732
- stride: 1
733
- weight_filler {
734
- type: "xavier"
735
- }
736
- bias_filler {
737
- type: "constant"
738
- value: 0
739
- }
740
- }
741
- }
742
- layer {
743
- name: "conv6_1_relu"
744
- type: "ReLU"
745
- bottom: "conv6_1_h"
746
- top: "conv6_1_h"
747
- }
748
- layer {
749
- name: "conv6_2_h"
750
- type: "Convolution"
751
- bottom: "conv6_1_h"
752
- top: "conv6_2_h"
753
- param {
754
- lr_mult: 1
755
- decay_mult: 1
756
- }
757
- param {
758
- lr_mult: 2
759
- decay_mult: 0
760
- }
761
- convolution_param {
762
- num_output: 256
763
- pad: 1
764
- kernel_size: 3
765
- stride: 2
766
- weight_filler {
767
- type: "xavier"
768
- }
769
- bias_filler {
770
- type: "constant"
771
- value: 0
772
- }
773
- }
774
- }
775
- layer {
776
- name: "conv6_2_relu"
777
- type: "ReLU"
778
- bottom: "conv6_2_h"
779
- top: "conv6_2_h"
780
- }
781
- layer {
782
- name: "conv7_1_h"
783
- type: "Convolution"
784
- bottom: "conv6_2_h"
785
- top: "conv7_1_h"
786
- param {
787
- lr_mult: 1
788
- decay_mult: 1
789
- }
790
- param {
791
- lr_mult: 2
792
- decay_mult: 0
793
- }
794
- convolution_param {
795
- num_output: 64
796
- pad: 0
797
- kernel_size: 1
798
- stride: 1
799
- weight_filler {
800
- type: "xavier"
801
- }
802
- bias_filler {
803
- type: "constant"
804
- value: 0
805
- }
806
- }
807
- }
808
- layer {
809
- name: "conv7_1_relu"
810
- type: "ReLU"
811
- bottom: "conv7_1_h"
812
- top: "conv7_1_h"
813
- }
814
- layer {
815
- name: "conv7_2_h"
816
- type: "Convolution"
817
- bottom: "conv7_1_h"
818
- top: "conv7_2_h"
819
- param {
820
- lr_mult: 1
821
- decay_mult: 1
822
- }
823
- param {
824
- lr_mult: 2
825
- decay_mult: 0
826
- }
827
- convolution_param {
828
- num_output: 128
829
- pad: 1
830
- kernel_size: 3
831
- stride: 2
832
- weight_filler {
833
- type: "xavier"
834
- }
835
- bias_filler {
836
- type: "constant"
837
- value: 0
838
- }
839
- }
840
- }
841
- layer {
842
- name: "conv7_2_relu"
843
- type: "ReLU"
844
- bottom: "conv7_2_h"
845
- top: "conv7_2_h"
846
- }
847
- layer {
848
- name: "conv8_1_h"
849
- type: "Convolution"
850
- bottom: "conv7_2_h"
851
- top: "conv8_1_h"
852
- param {
853
- lr_mult: 1
854
- decay_mult: 1
855
- }
856
- param {
857
- lr_mult: 2
858
- decay_mult: 0
859
- }
860
- convolution_param {
861
- num_output: 64
862
- pad: 0
863
- kernel_size: 1
864
- stride: 1
865
- weight_filler {
866
- type: "xavier"
867
- }
868
- bias_filler {
869
- type: "constant"
870
- value: 0
871
- }
872
- }
873
- }
874
- layer {
875
- name: "conv8_1_relu"
876
- type: "ReLU"
877
- bottom: "conv8_1_h"
878
- top: "conv8_1_h"
879
- }
880
- layer {
881
- name: "conv8_2_h"
882
- type: "Convolution"
883
- bottom: "conv8_1_h"
884
- top: "conv8_2_h"
885
- param {
886
- lr_mult: 1
887
- decay_mult: 1
888
- }
889
- param {
890
- lr_mult: 2
891
- decay_mult: 0
892
- }
893
- convolution_param {
894
- num_output: 128
895
- pad: 1
896
- kernel_size: 3
897
- stride: 1
898
- weight_filler {
899
- type: "xavier"
900
- }
901
- bias_filler {
902
- type: "constant"
903
- value: 0
904
- }
905
- }
906
- }
907
- layer {
908
- name: "conv8_2_relu"
909
- type: "ReLU"
910
- bottom: "conv8_2_h"
911
- top: "conv8_2_h"
912
- }
913
- layer {
914
- name: "conv9_1_h"
915
- type: "Convolution"
916
- bottom: "conv8_2_h"
917
- top: "conv9_1_h"
918
- param {
919
- lr_mult: 1
920
- decay_mult: 1
921
- }
922
- param {
923
- lr_mult: 2
924
- decay_mult: 0
925
- }
926
- convolution_param {
927
- num_output: 64
928
- pad: 0
929
- kernel_size: 1
930
- stride: 1
931
- weight_filler {
932
- type: "xavier"
933
- }
934
- bias_filler {
935
- type: "constant"
936
- value: 0
937
- }
938
- }
939
- }
940
- layer {
941
- name: "conv9_1_relu"
942
- type: "ReLU"
943
- bottom: "conv9_1_h"
944
- top: "conv9_1_h"
945
- }
946
- layer {
947
- name: "conv9_2_h"
948
- type: "Convolution"
949
- bottom: "conv9_1_h"
950
- top: "conv9_2_h"
951
- param {
952
- lr_mult: 1
953
- decay_mult: 1
954
- }
955
- param {
956
- lr_mult: 2
957
- decay_mult: 0
958
- }
959
- convolution_param {
960
- num_output: 128
961
- pad: 1
962
- kernel_size: 3
963
- stride: 1
964
- weight_filler {
965
- type: "xavier"
966
- }
967
- bias_filler {
968
- type: "constant"
969
- value: 0
970
- }
971
- }
972
- }
973
- layer {
974
- name: "conv9_2_relu"
975
- type: "ReLU"
976
- bottom: "conv9_2_h"
977
- top: "conv9_2_h"
978
- }
979
- layer {
980
- name: "conv4_3_norm"
981
- type: "Normalize"
982
- bottom: "layer_256_1_bn1"
983
- top: "conv4_3_norm"
984
- norm_param {
985
- across_spatial: false
986
- scale_filler {
987
- type: "constant"
988
- value: 20
989
- }
990
- channel_shared: false
991
- }
992
- }
993
- layer {
994
- name: "conv4_3_norm_mbox_loc"
995
- type: "Convolution"
996
- bottom: "conv4_3_norm"
997
- top: "conv4_3_norm_mbox_loc"
998
- param {
999
- lr_mult: 1
1000
- decay_mult: 1
1001
- }
1002
- param {
1003
- lr_mult: 2
1004
- decay_mult: 0
1005
- }
1006
- convolution_param {
1007
- num_output: 16
1008
- pad: 1
1009
- kernel_size: 3
1010
- stride: 1
1011
- weight_filler {
1012
- type: "xavier"
1013
- }
1014
- bias_filler {
1015
- type: "constant"
1016
- value: 0
1017
- }
1018
- }
1019
- }
1020
- layer {
1021
- name: "conv4_3_norm_mbox_loc_perm"
1022
- type: "Permute"
1023
- bottom: "conv4_3_norm_mbox_loc"
1024
- top: "conv4_3_norm_mbox_loc_perm"
1025
- permute_param {
1026
- order: 0
1027
- order: 2
1028
- order: 3
1029
- order: 1
1030
- }
1031
- }
1032
- layer {
1033
- name: "conv4_3_norm_mbox_loc_flat"
1034
- type: "Flatten"
1035
- bottom: "conv4_3_norm_mbox_loc_perm"
1036
- top: "conv4_3_norm_mbox_loc_flat"
1037
- flatten_param {
1038
- axis: 1
1039
- }
1040
- }
1041
- layer {
1042
- name: "conv4_3_norm_mbox_conf"
1043
- type: "Convolution"
1044
- bottom: "conv4_3_norm"
1045
- top: "conv4_3_norm_mbox_conf"
1046
- param {
1047
- lr_mult: 1
1048
- decay_mult: 1
1049
- }
1050
- param {
1051
- lr_mult: 2
1052
- decay_mult: 0
1053
- }
1054
- convolution_param {
1055
- num_output: 8 # 84
1056
- pad: 1
1057
- kernel_size: 3
1058
- stride: 1
1059
- weight_filler {
1060
- type: "xavier"
1061
- }
1062
- bias_filler {
1063
- type: "constant"
1064
- value: 0
1065
- }
1066
- }
1067
- }
1068
- layer {
1069
- name: "conv4_3_norm_mbox_conf_perm"
1070
- type: "Permute"
1071
- bottom: "conv4_3_norm_mbox_conf"
1072
- top: "conv4_3_norm_mbox_conf_perm"
1073
- permute_param {
1074
- order: 0
1075
- order: 2
1076
- order: 3
1077
- order: 1
1078
- }
1079
- }
1080
- layer {
1081
- name: "conv4_3_norm_mbox_conf_flat"
1082
- type: "Flatten"
1083
- bottom: "conv4_3_norm_mbox_conf_perm"
1084
- top: "conv4_3_norm_mbox_conf_flat"
1085
- flatten_param {
1086
- axis: 1
1087
- }
1088
- }
1089
- layer {
1090
- name: "conv4_3_norm_mbox_priorbox"
1091
- type: "PriorBox"
1092
- bottom: "conv4_3_norm"
1093
- bottom: "data"
1094
- top: "conv4_3_norm_mbox_priorbox"
1095
- prior_box_param {
1096
- min_size: 30.0
1097
- max_size: 60.0
1098
- aspect_ratio: 2
1099
- flip: true
1100
- clip: false
1101
- variance: 0.1
1102
- variance: 0.1
1103
- variance: 0.2
1104
- variance: 0.2
1105
- step: 8
1106
- offset: 0.5
1107
- }
1108
- }
1109
- layer {
1110
- name: "fc7_mbox_loc"
1111
- type: "Convolution"
1112
- bottom: "fc7"
1113
- top: "fc7_mbox_loc"
1114
- param {
1115
- lr_mult: 1
1116
- decay_mult: 1
1117
- }
1118
- param {
1119
- lr_mult: 2
1120
- decay_mult: 0
1121
- }
1122
- convolution_param {
1123
- num_output: 24
1124
- pad: 1
1125
- kernel_size: 3
1126
- stride: 1
1127
- weight_filler {
1128
- type: "xavier"
1129
- }
1130
- bias_filler {
1131
- type: "constant"
1132
- value: 0
1133
- }
1134
- }
1135
- }
1136
- layer {
1137
- name: "fc7_mbox_loc_perm"
1138
- type: "Permute"
1139
- bottom: "fc7_mbox_loc"
1140
- top: "fc7_mbox_loc_perm"
1141
- permute_param {
1142
- order: 0
1143
- order: 2
1144
- order: 3
1145
- order: 1
1146
- }
1147
- }
1148
- layer {
1149
- name: "fc7_mbox_loc_flat"
1150
- type: "Flatten"
1151
- bottom: "fc7_mbox_loc_perm"
1152
- top: "fc7_mbox_loc_flat"
1153
- flatten_param {
1154
- axis: 1
1155
- }
1156
- }
1157
- layer {
1158
- name: "fc7_mbox_conf"
1159
- type: "Convolution"
1160
- bottom: "fc7"
1161
- top: "fc7_mbox_conf"
1162
- param {
1163
- lr_mult: 1
1164
- decay_mult: 1
1165
- }
1166
- param {
1167
- lr_mult: 2
1168
- decay_mult: 0
1169
- }
1170
- convolution_param {
1171
- num_output: 12 # 126
1172
- pad: 1
1173
- kernel_size: 3
1174
- stride: 1
1175
- weight_filler {
1176
- type: "xavier"
1177
- }
1178
- bias_filler {
1179
- type: "constant"
1180
- value: 0
1181
- }
1182
- }
1183
- }
1184
- layer {
1185
- name: "fc7_mbox_conf_perm"
1186
- type: "Permute"
1187
- bottom: "fc7_mbox_conf"
1188
- top: "fc7_mbox_conf_perm"
1189
- permute_param {
1190
- order: 0
1191
- order: 2
1192
- order: 3
1193
- order: 1
1194
- }
1195
- }
1196
- layer {
1197
- name: "fc7_mbox_conf_flat"
1198
- type: "Flatten"
1199
- bottom: "fc7_mbox_conf_perm"
1200
- top: "fc7_mbox_conf_flat"
1201
- flatten_param {
1202
- axis: 1
1203
- }
1204
- }
1205
- layer {
1206
- name: "fc7_mbox_priorbox"
1207
- type: "PriorBox"
1208
- bottom: "fc7"
1209
- bottom: "data"
1210
- top: "fc7_mbox_priorbox"
1211
- prior_box_param {
1212
- min_size: 60.0
1213
- max_size: 111.0
1214
- aspect_ratio: 2
1215
- aspect_ratio: 3
1216
- flip: true
1217
- clip: false
1218
- variance: 0.1
1219
- variance: 0.1
1220
- variance: 0.2
1221
- variance: 0.2
1222
- step: 16
1223
- offset: 0.5
1224
- }
1225
- }
1226
- layer {
1227
- name: "conv6_2_mbox_loc"
1228
- type: "Convolution"
1229
- bottom: "conv6_2_h"
1230
- top: "conv6_2_mbox_loc"
1231
- param {
1232
- lr_mult: 1
1233
- decay_mult: 1
1234
- }
1235
- param {
1236
- lr_mult: 2
1237
- decay_mult: 0
1238
- }
1239
- convolution_param {
1240
- num_output: 24
1241
- pad: 1
1242
- kernel_size: 3
1243
- stride: 1
1244
- weight_filler {
1245
- type: "xavier"
1246
- }
1247
- bias_filler {
1248
- type: "constant"
1249
- value: 0
1250
- }
1251
- }
1252
- }
1253
- layer {
1254
- name: "conv6_2_mbox_loc_perm"
1255
- type: "Permute"
1256
- bottom: "conv6_2_mbox_loc"
1257
- top: "conv6_2_mbox_loc_perm"
1258
- permute_param {
1259
- order: 0
1260
- order: 2
1261
- order: 3
1262
- order: 1
1263
- }
1264
- }
1265
- layer {
1266
- name: "conv6_2_mbox_loc_flat"
1267
- type: "Flatten"
1268
- bottom: "conv6_2_mbox_loc_perm"
1269
- top: "conv6_2_mbox_loc_flat"
1270
- flatten_param {
1271
- axis: 1
1272
- }
1273
- }
1274
- layer {
1275
- name: "conv6_2_mbox_conf"
1276
- type: "Convolution"
1277
- bottom: "conv6_2_h"
1278
- top: "conv6_2_mbox_conf"
1279
- param {
1280
- lr_mult: 1
1281
- decay_mult: 1
1282
- }
1283
- param {
1284
- lr_mult: 2
1285
- decay_mult: 0
1286
- }
1287
- convolution_param {
1288
- num_output: 12 # 126
1289
- pad: 1
1290
- kernel_size: 3
1291
- stride: 1
1292
- weight_filler {
1293
- type: "xavier"
1294
- }
1295
- bias_filler {
1296
- type: "constant"
1297
- value: 0
1298
- }
1299
- }
1300
- }
1301
- layer {
1302
- name: "conv6_2_mbox_conf_perm"
1303
- type: "Permute"
1304
- bottom: "conv6_2_mbox_conf"
1305
- top: "conv6_2_mbox_conf_perm"
1306
- permute_param {
1307
- order: 0
1308
- order: 2
1309
- order: 3
1310
- order: 1
1311
- }
1312
- }
1313
- layer {
1314
- name: "conv6_2_mbox_conf_flat"
1315
- type: "Flatten"
1316
- bottom: "conv6_2_mbox_conf_perm"
1317
- top: "conv6_2_mbox_conf_flat"
1318
- flatten_param {
1319
- axis: 1
1320
- }
1321
- }
1322
- layer {
1323
- name: "conv6_2_mbox_priorbox"
1324
- type: "PriorBox"
1325
- bottom: "conv6_2_h"
1326
- bottom: "data"
1327
- top: "conv6_2_mbox_priorbox"
1328
- prior_box_param {
1329
- min_size: 111.0
1330
- max_size: 162.0
1331
- aspect_ratio: 2
1332
- aspect_ratio: 3
1333
- flip: true
1334
- clip: false
1335
- variance: 0.1
1336
- variance: 0.1
1337
- variance: 0.2
1338
- variance: 0.2
1339
- step: 32
1340
- offset: 0.5
1341
- }
1342
- }
1343
- layer {
1344
- name: "conv7_2_mbox_loc"
1345
- type: "Convolution"
1346
- bottom: "conv7_2_h"
1347
- top: "conv7_2_mbox_loc"
1348
- param {
1349
- lr_mult: 1
1350
- decay_mult: 1
1351
- }
1352
- param {
1353
- lr_mult: 2
1354
- decay_mult: 0
1355
- }
1356
- convolution_param {
1357
- num_output: 24
1358
- pad: 1
1359
- kernel_size: 3
1360
- stride: 1
1361
- weight_filler {
1362
- type: "xavier"
1363
- }
1364
- bias_filler {
1365
- type: "constant"
1366
- value: 0
1367
- }
1368
- }
1369
- }
1370
- layer {
1371
- name: "conv7_2_mbox_loc_perm"
1372
- type: "Permute"
1373
- bottom: "conv7_2_mbox_loc"
1374
- top: "conv7_2_mbox_loc_perm"
1375
- permute_param {
1376
- order: 0
1377
- order: 2
1378
- order: 3
1379
- order: 1
1380
- }
1381
- }
1382
- layer {
1383
- name: "conv7_2_mbox_loc_flat"
1384
- type: "Flatten"
1385
- bottom: "conv7_2_mbox_loc_perm"
1386
- top: "conv7_2_mbox_loc_flat"
1387
- flatten_param {
1388
- axis: 1
1389
- }
1390
- }
1391
- layer {
1392
- name: "conv7_2_mbox_conf"
1393
- type: "Convolution"
1394
- bottom: "conv7_2_h"
1395
- top: "conv7_2_mbox_conf"
1396
- param {
1397
- lr_mult: 1
1398
- decay_mult: 1
1399
- }
1400
- param {
1401
- lr_mult: 2
1402
- decay_mult: 0
1403
- }
1404
- convolution_param {
1405
- num_output: 12 # 126
1406
- pad: 1
1407
- kernel_size: 3
1408
- stride: 1
1409
- weight_filler {
1410
- type: "xavier"
1411
- }
1412
- bias_filler {
1413
- type: "constant"
1414
- value: 0
1415
- }
1416
- }
1417
- }
1418
- layer {
1419
- name: "conv7_2_mbox_conf_perm"
1420
- type: "Permute"
1421
- bottom: "conv7_2_mbox_conf"
1422
- top: "conv7_2_mbox_conf_perm"
1423
- permute_param {
1424
- order: 0
1425
- order: 2
1426
- order: 3
1427
- order: 1
1428
- }
1429
- }
1430
- layer {
1431
- name: "conv7_2_mbox_conf_flat"
1432
- type: "Flatten"
1433
- bottom: "conv7_2_mbox_conf_perm"
1434
- top: "conv7_2_mbox_conf_flat"
1435
- flatten_param {
1436
- axis: 1
1437
- }
1438
- }
1439
- layer {
1440
- name: "conv7_2_mbox_priorbox"
1441
- type: "PriorBox"
1442
- bottom: "conv7_2_h"
1443
- bottom: "data"
1444
- top: "conv7_2_mbox_priorbox"
1445
- prior_box_param {
1446
- min_size: 162.0
1447
- max_size: 213.0
1448
- aspect_ratio: 2
1449
- aspect_ratio: 3
1450
- flip: true
1451
- clip: false
1452
- variance: 0.1
1453
- variance: 0.1
1454
- variance: 0.2
1455
- variance: 0.2
1456
- step: 64
1457
- offset: 0.5
1458
- }
1459
- }
1460
- layer {
1461
- name: "conv8_2_mbox_loc"
1462
- type: "Convolution"
1463
- bottom: "conv8_2_h"
1464
- top: "conv8_2_mbox_loc"
1465
- param {
1466
- lr_mult: 1
1467
- decay_mult: 1
1468
- }
1469
- param {
1470
- lr_mult: 2
1471
- decay_mult: 0
1472
- }
1473
- convolution_param {
1474
- num_output: 16
1475
- pad: 1
1476
- kernel_size: 3
1477
- stride: 1
1478
- weight_filler {
1479
- type: "xavier"
1480
- }
1481
- bias_filler {
1482
- type: "constant"
1483
- value: 0
1484
- }
1485
- }
1486
- }
1487
- layer {
1488
- name: "conv8_2_mbox_loc_perm"
1489
- type: "Permute"
1490
- bottom: "conv8_2_mbox_loc"
1491
- top: "conv8_2_mbox_loc_perm"
1492
- permute_param {
1493
- order: 0
1494
- order: 2
1495
- order: 3
1496
- order: 1
1497
- }
1498
- }
1499
- layer {
1500
- name: "conv8_2_mbox_loc_flat"
1501
- type: "Flatten"
1502
- bottom: "conv8_2_mbox_loc_perm"
1503
- top: "conv8_2_mbox_loc_flat"
1504
- flatten_param {
1505
- axis: 1
1506
- }
1507
- }
1508
- layer {
1509
- name: "conv8_2_mbox_conf"
1510
- type: "Convolution"
1511
- bottom: "conv8_2_h"
1512
- top: "conv8_2_mbox_conf"
1513
- param {
1514
- lr_mult: 1
1515
- decay_mult: 1
1516
- }
1517
- param {
1518
- lr_mult: 2
1519
- decay_mult: 0
1520
- }
1521
- convolution_param {
1522
- num_output: 8 # 84
1523
- pad: 1
1524
- kernel_size: 3
1525
- stride: 1
1526
- weight_filler {
1527
- type: "xavier"
1528
- }
1529
- bias_filler {
1530
- type: "constant"
1531
- value: 0
1532
- }
1533
- }
1534
- }
1535
- layer {
1536
- name: "conv8_2_mbox_conf_perm"
1537
- type: "Permute"
1538
- bottom: "conv8_2_mbox_conf"
1539
- top: "conv8_2_mbox_conf_perm"
1540
- permute_param {
1541
- order: 0
1542
- order: 2
1543
- order: 3
1544
- order: 1
1545
- }
1546
- }
1547
- layer {
1548
- name: "conv8_2_mbox_conf_flat"
1549
- type: "Flatten"
1550
- bottom: "conv8_2_mbox_conf_perm"
1551
- top: "conv8_2_mbox_conf_flat"
1552
- flatten_param {
1553
- axis: 1
1554
- }
1555
- }
1556
- layer {
1557
- name: "conv8_2_mbox_priorbox"
1558
- type: "PriorBox"
1559
- bottom: "conv8_2_h"
1560
- bottom: "data"
1561
- top: "conv8_2_mbox_priorbox"
1562
- prior_box_param {
1563
- min_size: 213.0
1564
- max_size: 264.0
1565
- aspect_ratio: 2
1566
- flip: true
1567
- clip: false
1568
- variance: 0.1
1569
- variance: 0.1
1570
- variance: 0.2
1571
- variance: 0.2
1572
- step: 100
1573
- offset: 0.5
1574
- }
1575
- }
1576
- layer {
1577
- name: "conv9_2_mbox_loc"
1578
- type: "Convolution"
1579
- bottom: "conv9_2_h"
1580
- top: "conv9_2_mbox_loc"
1581
- param {
1582
- lr_mult: 1
1583
- decay_mult: 1
1584
- }
1585
- param {
1586
- lr_mult: 2
1587
- decay_mult: 0
1588
- }
1589
- convolution_param {
1590
- num_output: 16
1591
- pad: 1
1592
- kernel_size: 3
1593
- stride: 1
1594
- weight_filler {
1595
- type: "xavier"
1596
- }
1597
- bias_filler {
1598
- type: "constant"
1599
- value: 0
1600
- }
1601
- }
1602
- }
1603
- layer {
1604
- name: "conv9_2_mbox_loc_perm"
1605
- type: "Permute"
1606
- bottom: "conv9_2_mbox_loc"
1607
- top: "conv9_2_mbox_loc_perm"
1608
- permute_param {
1609
- order: 0
1610
- order: 2
1611
- order: 3
1612
- order: 1
1613
- }
1614
- }
1615
- layer {
1616
- name: "conv9_2_mbox_loc_flat"
1617
- type: "Flatten"
1618
- bottom: "conv9_2_mbox_loc_perm"
1619
- top: "conv9_2_mbox_loc_flat"
1620
- flatten_param {
1621
- axis: 1
1622
- }
1623
- }
1624
- layer {
1625
- name: "conv9_2_mbox_conf"
1626
- type: "Convolution"
1627
- bottom: "conv9_2_h"
1628
- top: "conv9_2_mbox_conf"
1629
- param {
1630
- lr_mult: 1
1631
- decay_mult: 1
1632
- }
1633
- param {
1634
- lr_mult: 2
1635
- decay_mult: 0
1636
- }
1637
- convolution_param {
1638
- num_output: 8 # 84
1639
- pad: 1
1640
- kernel_size: 3
1641
- stride: 1
1642
- weight_filler {
1643
- type: "xavier"
1644
- }
1645
- bias_filler {
1646
- type: "constant"
1647
- value: 0
1648
- }
1649
- }
1650
- }
1651
- layer {
1652
- name: "conv9_2_mbox_conf_perm"
1653
- type: "Permute"
1654
- bottom: "conv9_2_mbox_conf"
1655
- top: "conv9_2_mbox_conf_perm"
1656
- permute_param {
1657
- order: 0
1658
- order: 2
1659
- order: 3
1660
- order: 1
1661
- }
1662
- }
1663
- layer {
1664
- name: "conv9_2_mbox_conf_flat"
1665
- type: "Flatten"
1666
- bottom: "conv9_2_mbox_conf_perm"
1667
- top: "conv9_2_mbox_conf_flat"
1668
- flatten_param {
1669
- axis: 1
1670
- }
1671
- }
1672
- layer {
1673
- name: "conv9_2_mbox_priorbox"
1674
- type: "PriorBox"
1675
- bottom: "conv9_2_h"
1676
- bottom: "data"
1677
- top: "conv9_2_mbox_priorbox"
1678
- prior_box_param {
1679
- min_size: 264.0
1680
- max_size: 315.0
1681
- aspect_ratio: 2
1682
- flip: true
1683
- clip: false
1684
- variance: 0.1
1685
- variance: 0.1
1686
- variance: 0.2
1687
- variance: 0.2
1688
- step: 300
1689
- offset: 0.5
1690
- }
1691
- }
1692
- layer {
1693
- name: "mbox_loc"
1694
- type: "Concat"
1695
- bottom: "conv4_3_norm_mbox_loc_flat"
1696
- bottom: "fc7_mbox_loc_flat"
1697
- bottom: "conv6_2_mbox_loc_flat"
1698
- bottom: "conv7_2_mbox_loc_flat"
1699
- bottom: "conv8_2_mbox_loc_flat"
1700
- bottom: "conv9_2_mbox_loc_flat"
1701
- top: "mbox_loc"
1702
- concat_param {
1703
- axis: 1
1704
- }
1705
- }
1706
- layer {
1707
- name: "mbox_conf"
1708
- type: "Concat"
1709
- bottom: "conv4_3_norm_mbox_conf_flat"
1710
- bottom: "fc7_mbox_conf_flat"
1711
- bottom: "conv6_2_mbox_conf_flat"
1712
- bottom: "conv7_2_mbox_conf_flat"
1713
- bottom: "conv8_2_mbox_conf_flat"
1714
- bottom: "conv9_2_mbox_conf_flat"
1715
- top: "mbox_conf"
1716
- concat_param {
1717
- axis: 1
1718
- }
1719
- }
1720
- layer {
1721
- name: "mbox_priorbox"
1722
- type: "Concat"
1723
- bottom: "conv4_3_norm_mbox_priorbox"
1724
- bottom: "fc7_mbox_priorbox"
1725
- bottom: "conv6_2_mbox_priorbox"
1726
- bottom: "conv7_2_mbox_priorbox"
1727
- bottom: "conv8_2_mbox_priorbox"
1728
- bottom: "conv9_2_mbox_priorbox"
1729
- top: "mbox_priorbox"
1730
- concat_param {
1731
- axis: 2
1732
- }
1733
- }
1734
-
1735
- layer {
1736
- name: "mbox_conf_reshape"
1737
- type: "Reshape"
1738
- bottom: "mbox_conf"
1739
- top: "mbox_conf_reshape"
1740
- reshape_param {
1741
- shape {
1742
- dim: 0
1743
- dim: -1
1744
- dim: 2
1745
- }
1746
- }
1747
- }
1748
- layer {
1749
- name: "mbox_conf_softmax"
1750
- type: "Softmax"
1751
- bottom: "mbox_conf_reshape"
1752
- top: "mbox_conf_softmax"
1753
- softmax_param {
1754
- axis: 2
1755
- }
1756
- }
1757
- layer {
1758
- name: "mbox_conf_flatten"
1759
- type: "Flatten"
1760
- bottom: "mbox_conf_softmax"
1761
- top: "mbox_conf_flatten"
1762
- flatten_param {
1763
- axis: 1
1764
- }
1765
- }
1766
-
1767
- layer {
1768
- name: "detection_out"
1769
- type: "DetectionOutput"
1770
- bottom: "mbox_loc"
1771
- bottom: "mbox_conf_flatten"
1772
- bottom: "mbox_priorbox"
1773
- top: "detection_out"
1774
- include {
1775
- phase: TEST
1776
- }
1777
- detection_output_param {
1778
- num_classes: 2
1779
- share_location: true
1780
- background_label_id: 0
1781
- nms_param {
1782
- nms_threshold: 0.3
1783
- top_k: 400
1784
- }
1785
- code_type: CENTER_SIZE
1786
- keep_top_k: 200
1787
- confidence_threshold: 0.01
1788
- }
1789
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
download_models.py CHANGED
@@ -1,56 +1,16 @@
 
  import os
  import re
- import cv2
- import numpy as np
- import io
  import sys
  import numpy as np
- import timm
  import pyiqa
  import torch
- from transformers import DonutProcessor, VisionEncoderDecoderModel
-

  device = "cuda" if torch.cuda.is_available() else "cpu"

- licence_model = torch.hub.load(
-     "ultralytics/yolov5", "custom", path="Licenseplate_model.pt", device="cpu", force_reload=True
- )
- licence_model.cpu()
-
- detector = cv2.dnn.DetectionModel("res10_300x300_ssd_iter_140000_fp16.caffemodel", "deploy.prototxt")
-
- processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
- doc_qa_model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
- doc_qa_model.to(device)
-
- model = torch.hub.load(
-     "ultralytics/yolov5", "custom", path="best.pt", device="cpu", force_reload=True
- )
- model.cpu()
-
- classes = [
-     "gas-distribution-meter",
-     "gas-distribution-piping",
-     "gas-distribution-regulator",
-     "gas-distribution-valve"
- ]
-
- class_to_idx = {'gas-distribution-meter': 0,
-     'gas-distribution-piping': 1,
-     'gas-distribution-regulator': 2,
-     'gas-distribution-valve': 3}
-
- idx_to_classes = {v:k for k,v in class_to_idx.items()}
- modelname = "resnet50d"
- model_weights = "best_classifer_model.pt"
- num_classes = len(classes)
-
- classifier_model = timm.create_model(
-     "resnet50d", pretrained=True, num_classes=num_classes, drop_path_rate=0.05
- )
- classifier_model.load_state_dict(torch.load(model_weights, map_location=torch.device('cpu'))["model_state_dict"])

- musiq_metric = pyiqa.create_metric('musiq-koniq', device=torch.device('cpu'))

+ import io
  import os
  import re
  import sys
+
+ import cv2
  import numpy as np
  import pyiqa
+ import timm
  import torch
+ from sentence_transformers import SentenceTransformer

  device = "cuda" if torch.cuda.is_available() else "cpu"

+ image_sim_model = SentenceTransformer("clip-ViT-B-32")

res10_300x300_ssd_iter_140000_fp16.caffemodel DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:510ffd2471bd81e3fcc88a5beb4eae4fb445ccf8333ebc54e7302b83f4158a76
- size 5351047