sql-compute-grad (#2)
- update compute gradient with sql (98667f6d3f61d4fe705a49978307db02586eeb86)
- app.py +12 -6
- box_utils.py +61 -32
- card_model.py +2 -1
- classifier.py +46 -37
- query_model.py +2 -2
app.py
CHANGED

@@ -192,7 +192,7 @@ def submit(meta):
     zip(
         *(
             (
-                v[
+                v[0],
                 st.session_state.text_prompts.index(st.session_state[f"label-{i}"]),
             )
             for i, v in matches.items()

@@ -329,7 +329,7 @@ try:
     matches = st.session_state.matches
     # initialize classifier
    if "clf" not in st.session_state:
-        st.session_state.clf = Classifier(st.session_state.xq)
+        st.session_state.clf = Classifier(st.session_state.index, OBJ_DB_NAME, st.session_state.xq)
         st.session_state.step = 0
     if qtime > 0:
         st.info(

@@ -344,11 +344,13 @@ try:
             ),
         )
     )
+    lnprob = torch.nn.Linear(st.session_state.xq.shape[1], st.session_state.xq.shape[0], bias=False)
+    lnprob.weight = torch.nn.Parameter(st.session_state.clf.weight)

     # export the model into executable ONNX
     st.session_state.dnld_model = BytesIO()
     torch.onnx.export(
-        torch.nn.Sequential(
+        torch.nn.Sequential(lnprob, SplitLayer()),
         torch.zeros([1, len(st.session_state.xq[0])]),
         st.session_state.dnld_model,
         input_names=["input"],

@@ -370,7 +372,9 @@ try:
     with st.expander("Top-K Images"):
         with st.container():
             boxes_w_img, _ = postprocess(
-                o_matches, st.session_state.text_prompts,
+                o_matches, st.session_state.text_prompts, o_matches,
+                agnostic_ratio=1-0.6**(st.session_state.step+1),
+                class_ratio=1-0.2**(st.session_state.step+1)
             )
             boxes_w_img = sorted(boxes_w_img, key=lambda x: x[4], reverse=True)
             for img_id, img_url, img_w, img_h, img_score, boxes in boxes_w_img:

@@ -428,7 +432,9 @@ try:

     # Post processing boxes regarding to their score, intersection
     boxes_w_img, meta = postprocess(
-        matches, st.session_state.text_prompts, img_matches
+        matches, st.session_state.text_prompts, img_matches,
+        agnostic_ratio=1-0.6**(st.session_state.step+1),
+        class_ratio=1-0.2**(st.session_state.step+1)
     )

     # Sort the result according to their relavancy

@@ -452,7 +458,7 @@ try:
     img_row[0].write(card(*args), unsafe_allow_html=True)
     # crop objects out of the original image
     for b in boxes:
-        _id, cx, cy, w, h, label, logit, is_selected
+        _id, cx, cy, w, h, label, logit, is_selected = b[:8]
         with img_row[1 + ind_b % 3].container():
             st.write("{:s}: {:.4f}".format(label, logit))
             # quite hacky: with streamlit components API
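Side note on the export hunk above: because the classifier no longer carries its own torch.nn.Linear (its weights now live in a plain tensor updated through SQL), app.py rebuilds a bias-free Linear from clf.weight and chains it with SplitLayer just for the ONNX download. A minimal, self-contained sketch of that export path; the sizes and the random weight are made up and stand in for st.session_state.clf.weight:

from io import BytesIO

import torch


class SplitLayer(torch.nn.Module):
    # same idea as classifier.SplitLayer: split the (1, num_class) score row
    # into one tensor per class so each prompt gets its own ONNX output
    def forward(self, x):
        return torch.split(x, 1, dim=-1)


num_class, dims = 3, 512                  # hypothetical sizes
weight = torch.randn(num_class, dims)     # stands in for st.session_state.clf.weight

lnprob = torch.nn.Linear(dims, num_class, bias=False)
lnprob.weight = torch.nn.Parameter(weight)

buf = BytesIO()                           # st.session_state.dnld_model in the app
torch.onnx.export(
    torch.nn.Sequential(lnprob, SplitLayer()),
    torch.zeros([1, dims]),               # dummy input with the embedding width
    buf,
    input_names=["input"],
)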
box_utils.py
CHANGED

@@ -2,16 +2,14 @@ import numpy as np


 def cxywh2xywh(cx, cy, w, h):
-    """
-    """
+    """CxCyWH format to XYWH format conversion"""
     x = cx - w / 2
     y = cy - h / 2
     return x, y, w, h


 def cxywh2ltrb(cx, cy, w, h):
-    """CxCyWH format to LeftRightTopBottom format
-    """
+    """CxCyWH format to LeftRightTopBottom format"""
     l = cx - w / 2
     t = cy - h / 2
     r = cx + w / 2

@@ -61,9 +59,16 @@ def nms(cx, cy, w, h, s, iou_thresh=0.3):
         i = sort_ind[0]
         res.append(i)

-        _iou = iou(
-
-
+        _iou = iou(
+            (l[i], t[i], r[i], b[i], areas[i]),
+            (
+                l[sort_ind[1:]],
+                t[sort_ind[1:]],
+                r[sort_ind[1:]],
+                b[sort_ind[1:]],
+                areas[sort_ind[1:]],
+            ),
+        )
         sel_ind = np.where(_iou <= iou_thresh)[0]
         sort_ind = sort_ind[sel_ind + 1]
     return res

@@ -77,43 +82,64 @@ def filter_nonpos(boxes, agnostic_ratio=0.5, class_ratio=0.7):
     """
     ret = []
     labelwise = {}
-    for
+    for b in boxes:
+        _id, cx, cy, w, h, label, logit, is_selected = b[:8]
         if label not in labelwise:
             labelwise[label] = []
         labelwise[label].append(logit)
     labelwise = {l: max(s) for l, s in labelwise.items()}
     agnostic = max([v for _, v in labelwise.items()])
     for b in boxes:
-        _id, cx, cy, w, h, label, logit, is_selected
-        if logit > class_ratio * labelwise[label]
-            and logit > agnostic_ratio * agnostic:
+        _id, cx, cy, w, h, label, logit, is_selected = b[:8]
+        if logit > class_ratio * labelwise[label] and logit > agnostic_ratio * agnostic:
             ret.append(b)
     return ret


-def postprocess(matches, prompt_labels, img_matches=None):
+def postprocess(matches, prompt_labels, img_matches=None, agnostic_ratio=0.4, class_ratio=0.7):
     meta = []
     boxes_w_img = []
-    matches_ = {m[
+    matches_ = {m["img_id"]: m for m in matches}
     if img_matches is not None:
-        img_matches_ = {m[
+        img_matches_ = {m["img_id"]: m for m in img_matches}
     for k in matches_.keys():
         m = matches_[k]
         boxes = []
-        boxes += list(
-
-
-
-
-
+        boxes += list(
+            map(
+                list,
+                zip(
+                    m["box_id"],
+                    m["cx"],
+                    m["cy"],
+                    m["w"],
+                    m["h"],
+                    [prompt_labels[int(l)] for l in m["label"]],
+                    m["logit"],
+                    [1] * len(m["box_id"]),
+                ),
+            )
+        )
         if img_matches is not None and k in img_matches_:
             img_m = img_matches_[k]
             # and also those non-TopK hits and those non-topk are not anticipating training
-            boxes += [
-
-
-
-
+            boxes += [
+                i
+                for i in map(
+                    list,
+                    zip(
+                        img_m["box_id"],
+                        img_m["cx"],
+                        img_m["cy"],
+                        img_m["w"],
+                        img_m["h"],
+                        [prompt_labels[int(l)] for l in img_m["label"]],
+                        img_m["logit"],
+                        [0] * len(img_m["box_id"]),
+                    ),
+                )
+                if i[0] not in [b[0] for b in boxes]
+            ]
         else:
             img_m = None
         # update record metadata after query

@@ -121,16 +147,19 @@ def postprocess(matches, prompt_labels, img_matches=None):
             meta.append(b[0])

         # remove some non-significant boxes
-        boxes = filter_nonpos(
-            boxes, agnostic_ratio=0.4, class_ratio=0.7)
+        boxes = filter_nonpos(boxes, agnostic_ratio=agnostic_ratio, class_ratio=class_ratio)

         # doing non-maximum suppression
-        cx, cy, w, h, s = list(
-
+        cx, cy, w, h, s = list(
+            map(lambda x: np.array(x), list(zip(*[(*b[1:5], b[6]) for b in boxes])))
+        )
         ind = nms(cx, cy, w, h, s, 0.3)
         boxes = [boxes[i] for i in ind]
         if img_m is not None:
-            img_score =
+            img_score = (
+                img_m["img_score"] if img_matches is not None else m["img_score"]
+            )
         boxes_w_img.append(
-            (m["img_id"], m["img_url"], m["img_w"], m["img_h"], img_score, boxes)
-
+            (m["img_id"], m["img_url"], m["img_w"], m["img_h"], img_score, boxes)
+        )
+    return boxes_w_img, meta
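The filter_nonpos/postprocess change above threads two thresholds through the pipeline: a box survives only if its logit is close enough to the best logit of its own label (class_ratio) and to the best logit overall (agnostic_ratio), and app.py now tightens both as the user completes more feedback steps. A rough sketch of the rule and the schedule; keep_box is a hypothetical helper, the real check is written inline in filter_nonpos:

def keep_box(logit, label, best_per_label, best_overall,
             agnostic_ratio=0.4, class_ratio=0.7):
    # survive only if close to the best score of the same label AND
    # close to the best score across all labels
    return (logit > class_ratio * best_per_label[label]
            and logit > agnostic_ratio * best_overall)


best_per_label = {"cat": 0.9, "dog": 0.7}   # invented per-label maxima
print(keep_box(0.75, "cat", best_per_label, best_overall=0.9))   # True
print(keep_box(0.30, "dog", best_per_label, best_overall=0.9))   # False

for step in range(3):
    # thresholds app.py passes; they tighten with every feedback round
    agnostic_ratio = 1 - 0.6 ** (step + 1)   # 0.4, 0.64, 0.784, ...
    class_ratio = 1 - 0.2 ** (step + 1)      # 0.8, 0.96, 0.992, ...
    print(step, round(agnostic_ratio, 3), round(class_ratio, 3))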
card_model.py
CHANGED

@@ -47,7 +47,8 @@ def card(img_url, img_w, img_h, boxes):
     """
     _boxes = ""
     img_url = convert_img_url(img_url)
-    for
+    for b in boxes:
+        _id, cx, cy, w, h, label, logit, is_selected = b[:8]
         x, y, w, h = cxywh2xywh(cx, cy, w, h)
         x = round(img_w * x)
         y = round(img_h * y)
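For context, card() turns each normalized (cx, cy, w, h) box into an absolute pixel rectangle for the HTML overlay. A tiny sketch of that conversion with an invented image size; the hunk above only shows x and y being scaled, so scaling w and h the same way is an assumption:

from box_utils import cxywh2xywh            # repo helper shown in the diff above

img_w, img_h = 640, 480                     # invented image size
cx, cy, w, h = 0.5, 0.5, 0.2, 0.4           # normalized box, invented values

x, y, w, h = cxywh2xywh(cx, cy, w, h)       # -> top-left corner, still normalized
x, y = round(img_w * x), round(img_h * y)   # -> 256, 144
w, h = round(img_w * w), round(img_h * h)   # -> 128, 192 (assumed, not in the hunk)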
classifier.py
CHANGED

@@ -1,7 +1,7 @@
 import torch


-def extract_text_feature(prompt, model, processor, device=
+def extract_text_feature(prompt, model, processor, device="cpu"):
     """Extract text features

     Args:

@@ -10,12 +10,11 @@ def extract_text_feature(prompt, model, processor, device='cpu'):
         processor: OwlViT processor
         device (str, optional): device to run. Defaults to 'cpu'.
     """
-    device =
+    device = "cpu"
     if torch.cuda.is_available():
-        device =
+        device = "cuda"
     with torch.no_grad():
-        input_ids = torch.as_tensor(processor(text=prompt)[
-            'input_ids']).to(device)
+        input_ids = torch.as_tensor(processor(text=prompt)["input_ids"]).to(device)
         print(input_ids.device)
         text_outputs = model.owlvit.text_model(
             input_ids=input_ids,

@@ -32,7 +31,7 @@ def extract_text_feature(prompt, model, processor, device='cpu'):


 def prompt2vec(prompt: str, model, processor):
-    """
+    """Convert prompt into a computational vector

     Args:
         prompt (str): Text to be tokenized

@@ -49,7 +48,7 @@ def prompt2vec(prompt: str, model, processor):


 def tune(clf, X, y, iters=2):
-    """
+    """Train the Zero-shot Classifier

     Args:
         X (numpy.ndarray): Input vectors (retreived vectors)

@@ -62,60 +61,70 @@ def tune(clf, X, y, iters=2):
     # extract new vector
     return clf.get_weights()

-
 class Classifier:
     """Multi-Class Zero-shot Classifier
     This Classifier provides proxy regarding to the user's reaction to the probed images.
     The proxy will replace the original query vector generated by prompted vector and finally
     give the user a satisfying retrieval result.

-    This can be commonly seen in a recommendation system. The classifier will recommend more
+    This can be commonly seen in a recommendation system. The classifier will recommend more
     precise result as it accumulating user's activity.
-
+
     This is a multiclass classifier. For N queries it will set the all queries to the first-N classes
     and the last one takes the negative one.
     """

-    def __init__(self, xq: list):
+    def __init__(self, client, obj_db:str, xq: list):
         init_weight = torch.Tensor(xq)
         self.num_class = xq.shape[0]
-        DIMS = xq.shape[1]
-        # note that the bias is ignored, as we only focus on the inner product result
-        self.model = torch.nn.Linear(DIMS, self.num_class, bias=False)
+        self.DIMS = xq.shape[1]
         # convert initial query `xq` to tensor parameter to init weights
-        self.
-
-        self.
-        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
+        self.weight = init_weight
+        self.client = client
+        self.obj_db = obj_db

     def fit(self, X: list, y: list, iters: int = 5):
         # convert X and y to tensor
-
-
-
-
-
-
-        y[non_ind] = 0
-        for i in range(iters):
+        xq_s = [
+            f"[{', '.join([str(float(fnum)) for fnum in _xq + [1]])}]"
+            for _xq in self.get_weights().tolist()
+        ]
+
+        for _ in range(iters):
             # zero gradients
-
+            grad = []
             # Normalize the weight before inference
             # This will constrain the gradient or you will have an explosion on query vector
-            self.
-
-
-
-
-
-
+            self.weight.data /= torch.norm(
+                self.weight.data, p=2, dim=-1, keepdim=True
+            )
+            for n in range(self.num_class):
+                # select all training sample and create labels
+                labels, objs = list(map(list, zip(*[[1 if y[i]==n else 0, x] for i, x in enumerate(X) if y[i] in [n, self.num_class+1]])))
+
+                # NOTE from @fangruil
+                # Use SQL to calculate the gradient
+                # For binary cross entropy we have
+                # g = (1/(1+\exp(-XW))-Y)^TX
+                # To simplify the query, we separated
+                # the calculation into class numbers
+                grad_q_str = f"""
+                    SELECT sumForEachArray(arrayMap((x,y,gt)->arrayMap(i->i*(y-gt), x), X, Y, GT)) AS grad
+                    FROM (
+                        SELECT groupArray(arrayPopBack(prelogit)) AS X,
+                        groupArray(1/(1+exp(-arraySum(arrayMap((x,y)->x*y, prelogit, {xq_s[n]}))))) AS Y, {labels} AS GT
+                        FROM {self.obj_db} WHERE obj_id IN {objs})"""
+                grad.append(torch.as_tensor(self.client.fetch(grad_q_str)[0]['grad']))
             # update weights
-
+            grad = torch.stack(grad, dim=0)
+            self.weight -= 0.1 * grad

     def get_weights(self):
-        xq = self.
+        xq = self.weight.detach().numpy()
         return xq
-
+
+
+
 class SplitLayer(torch.nn.Module):
     def forward(self, x):
         return torch.split(x, 1, dim=-1)
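The heart of this commit is Classifier.fit: instead of backpropagating through a torch module, it asks ClickHouse to evaluate the binary-cross-entropy gradient g = (1/(1+exp(-Xw)) - y)^T X over the stored prelogit vectors for each class, and Python only applies the SGD step. A reference sketch of the same formula in plain torch, with toy shapes (the extra bias element handled by `+ [1]` and arrayPopBack in the query is omitted for clarity):

import torch


def bce_grad(X: torch.Tensor, y: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    """g = (sigmoid(X @ w) - y)^T X, the quantity the SQL query returns as `grad`."""
    p = torch.sigmoid(X @ w)   # the query's Y column: 1 / (1 + exp(-prelogit . w))
    return (p - y) @ X         # sumForEachArray(arrayMap(... x * (y - gt) ...))


# toy shapes: 4 labelled boxes, 3-dim embeddings (real dims come from xq.shape[1])
X = torch.randn(4, 3)                      # stored prelogit vectors for class n
y = torch.tensor([1.0, 0.0, 1.0, 0.0])     # 1 if the box was labelled as class n
w = torch.randn(3)                         # current query/weight vector for class n
w = w - 0.1 * bce_grad(X, y, w)            # same step size as Classifier.fit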
query_model.py
CHANGED

@@ -32,7 +32,7 @@ def topk_obj_query(client, xq, IMG_DB_NAME, OBJ_DB_NAME,
     q_str = f"""
         SELECT img_id, img_url, img_w, img_h, groupArray(obj_id) AS box_id,
         groupArray(box_cx) AS cx, groupArray(box_cy) AS cy, groupArray(box_w) AS w, groupArray(box_h) AS h,
-        groupArray(pred_logit) AS logit, groupArray(l) as label,
+        groupArray(pred_logit) AS logit, groupArray(l) as label,
         {_img_score_q}
         FROM
         ({_subq_str})

@@ -68,7 +68,7 @@ def rev_query(client, xq, img_ids, IMG_DB_NAME, OBJ_DB_NAME, thresh=0.08):
     q_str = f"""
         SELECT img_id, groupArray(obj_id) AS box_id, img_url, img_w, img_h,
         groupArray(box_cx) AS cx, groupArray(box_cy) AS cy, groupArray(box_w) AS w, groupArray(box_h) AS h,
-        groupArray(pred_logit) AS logit, groupArray(l) as label,
+        groupArray(pred_logit) AS logit, groupArray(l) as label,
         {_img_score_q}
         FROM
         ({_subq_str})