grostaco committed
Commit 9ff0cd2
1 Parent(s): 5aef4a3

feat: add segmentation

Files changed (3)
  1. app.py +25 -14
  2. lib/utils/model.py +53 -4
  3. pages/losses.py +42 -33
app.py CHANGED
@@ -1,7 +1,7 @@
import streamlit as st
-from st_pages import Page, show_pages, add_page_title, Section
-from lib.utils.model import get_model, get_similarities
-from lib.utils.timer import timer
+from st_pages import Page, show_pages, add_page_title, Section
+from lib.utils.model import get_model, get_similarities, get_detr, segment_images
+from lib.utils.timer import timer

add_page_title()

@@ -23,23 +23,31 @@ caption = st.text_input('Description Input')

images = st.file_uploader('Upload images', accept_multiple_files=True)
if images is not None:
-
-    st.image(images)  # type: ignore
+
+    st.image(images)  # type: ignore

st.header('Options')
st.subheader('Ranks', help='How many predictions the model is allowed to make')

-ranks = st.slider('slider_ranks', min_value=1, max_value=10, label_visibility='collapsed',value=5)
-button = st.button('Match most similar', disabled=len(images) == 0 or caption == '')
+ranks = st.slider('slider_ranks', min_value=1, max_value=10,
+                  label_visibility='collapsed', value=5)
+do_segment = st.checkbox('Segment images with DETR', value=False)
+button = st.button('Match most similar', disabled=len(
+    images) == 0 or caption == '')


if button:
+    if do_segment:
+        detr, processor = get_detr()
+        images = segment_images(detr, processor, images)
+
    st.header('Results')
    with st.spinner('Loading model'):
        model = get_model()

-    st.text(f'IRRA model loaded with {sum(p.numel() for p in model.parameters()) / 1e6:.0f}M parameters')
-
+    st.text(
+        f'IRRA model loaded with {sum(p.numel() for p in model.parameters()) / 1e6:.0f}M parameters')
+
    time = timer()
    with st.spinner('Computing and ranking similarities'):
        with timer() as t:
@@ -47,15 +55,16 @@ if button:
        elapsed = t()

    indices = similarities.argsort(descending=True).cpu().tolist()[:ranks]
-
+
    c1, c2, c3 = st.columns(3)
    with c1:
        st.subheader('Rank')
    with c2:
        st.subheader('Image')
    with c3:
-        st.subheader('Cosine Similarity', help='Due to the nature of the SDM loss, the higher the similarity, the more similar the match is')
-
+        st.subheader('Cosine Similarity',
+                     help='Due to the nature of the SDM loss, the higher the similarity, the more similar the match is')
+
    for i, idx in enumerate(indices):
        c1, c2, c3 = st.columns(3)
        with c1:
@@ -72,5 +81,7 @@ with st.sidebar:

    st.subheader('Useful Links')
    st.markdown('[arXiv: Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval](https://arxiv.org/abs/2303.12501)')
-    st.markdown('[IRRA implementation (Pytorch Lightning + Transformers)](https://github.com/grostaco/modern-IRRA)')
-    st.markdown('[IRRA implementation (PyTorch)](https://github.com/anosorae/IRRA/tree/main)')
+    st.markdown(
+        '[IRRA implementation (Pytorch Lightning + Transformers)](https://github.com/grostaco/modern-IRRA)')
+    st.markdown(
+        '[IRRA implementation (PyTorch)](https://github.com/anosorae/IRRA/tree/main)')
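For reference, the ranking step kept at the end of the `if button:` block is just a descending argsort over the similarity scores followed by a top-k cut. A minimal sketch with made-up scores (the tensor values and the `ranks` value are illustrative only, assuming a 1-D score tensor):

import torch

# Toy cosine-similarity scores for one caption against four images
# (a stand-in for the tensor returned by get_similarities).
similarities = torch.tensor([0.12, 0.87, 0.55, 0.31])

ranks = 2  # top-k value picked on the slider
indices = similarities.argsort(descending=True).tolist()[:ranks]
print(indices)  # [1, 2] -> closest images first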
lib/utils/model.py CHANGED
@@ -1,15 +1,20 @@
-import streamlit as st
+import streamlit as st
import yaml
import torch
import torch.nn.functional as F

+from transformers import DetrImageProcessor, DetrForObjectDetection
+
from lib.IRRA.tokenizer import tokenize, SimpleTokenizer
from lib.IRRA.image import prepare_images
from lib.IRRA.model.build import build_model, IRRA
+from PIL import Image
+from pathlib import Path

from easydict import EasyDict

-@st.cache_resource
+
+@st.cache_resource
def get_model():
    args = yaml.load(open('model/configs.yaml'), Loader=yaml.FullLoader)
    args = EasyDict(args)
@@ -17,7 +22,51 @@ def get_model():

    model = build_model(args)

-    return model
+    return model
+
+
+@st.cache_resource
+def get_detr():
+    processor = DetrImageProcessor.from_pretrained(
+        "facebook/detr-resnet-50", revision="no_timm")
+
+    model = DetrForObjectDetection.from_pretrained(
+        "facebook/detr-resnet-50", revision="no_timm")
+
+    return model, processor
+
+
+def segment_images(model, processor, images: list[str]):
+    segments = []
+    id = 0
+
+    p = Path('segments')
+    p.mkdir(exist_ok=True)
+
+    for image in images:
+        image = Image.open(image)
+
+        inputs = processor(images=image, return_tensors="pt")
+        outputs = model(**inputs)
+
+        target_sizes = torch.tensor([image.size[::-1]])
+        results = processor.post_process_object_detection(
+            outputs, target_sizes=target_sizes, threshold=0.9)[0]
+
+        for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+            box = [round(i, 2) for i in box.tolist()]
+            label = model.config.id2label[label.item()]
+
+            if box[2] - box[0] > 70 and box[3] - box[1] > 70:
+                if label == 'person':
+                    file = p / f'img_{id}.jpg'
+                    image.crop(box).save(file)
+                    segments.append(file.as_posix())
+
+                    id += 1
+
+    return segments
+

def get_similarities(text: str, images: list[str], model: IRRA) -> torch.Tensor:
    tokenizer = SimpleTokenizer()
@@ -30,5 +79,5 @@ def get_similarities(text: str, images: list[str], model: IRRA) -> torch.Tensor:

    image_feats = F.normalize(image_feats, p=2, dim=1)
    text_feats = F.normalize(text_feats, p=2, dim=1)
-
+
    return text_feats @ image_feats.t()
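Taken together, the new helpers compose with the existing ones roughly as app.py uses them: detect and crop people first, then rank the crops against a caption. A minimal usage sketch; the image paths and caption below are placeholders, and since get_detr/get_model are cached Streamlit resources they are meant to be called from an app such as app.py:

from lib.utils.model import get_detr, get_model, get_similarities, segment_images

detr, processor = get_detr()  # cached DETR-ResNet-50 detector + processor

# Placeholder paths; segment_images writes person crops under segments/
crops = segment_images(detr, processor, ['photos/a.jpg', 'photos/b.jpg'])

model = get_model()  # cached IRRA model
scores = get_similarities('a man wearing a red jacket', crops, model)
ranking = scores.argsort(descending=True)  # best-matching crops first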
pages/losses.py CHANGED
@@ -4,36 +4,45 @@ from st_pages import add_indentation
add_indentation()

st.title('Loss functions')
-st.subheader('SDM Loss')
-st.markdown('''
-The similarity distribution matching (SDM) loss, which is the KL divergence
-of the image to text and text to image to the label distribution.
-
-We define $f^v$ and $f^t$ to be the global representation of the visual and textual features respectively.
-The cosine similarity $sim(u, v) = \\frac{u \\cdot v}{|u||v|}$ will be used to compute the probability of the labels.
-
-We define $y_{i, j}=1$ if the visual feature $f^v_i$ matches the textual feature $f^t_j$, else $y_{i, j}=0$.
-The predicted label distribution can be formulated by''')
-st.latex(r'''
-p_{i} = \sigma(sim(f^v_i, f^t))
-''')
-
-st.markdown('''
-We can define the image to text loss as
-''')
-
-st.latex(r'''
-\mathcal{L}_{i2t} = KL(\mathbf{p_i} || \mathbf{q_i})
-''')
-
-st.markdown('Where $\\mathbf{q_i}$, the true probability distribution, is defined as')
-
-st.latex(r'''
-q_{i, j} = \frac{y_{i, j}}{\sum_{k=1}^{N} y_{i, k}}
-''')
-
-st.markdown('It should be noted that the reason this computation is needed is because there could be multiple correct labels.')
-
-
-st.subheader('IRR (MLM) Loss')
-st.subheader('ID Loss')
+st.markdown('In order to align textual and visual features, multiple loss functions are employed. '
+            'The most notable loss function was proposed in [arXiv: Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval](https://arxiv.org/abs/2303.12501) '
+            'with the introduction of the SDM loss and the usage of the IRR (Implicit Reason Relations) loss.')
+with st.expander('SDM Loss'):
+    st.markdown('''
+    The similarity distribution matching (SDM) loss, which is the KL divergence
+    of the image to text and text to image to the label distribution.
+
+    We define $f^v$ and $f^t$ to be the global representation of the visual and textual features respectively.
+    The cosine similarity $sim(u, v) = \\frac{u \\cdot v}{|u||v|}$ will be used to compute the probability of the labels.
+
+    We define $y_{i, j}=1$ if the visual feature $f^v_i$ matches the textual feature $f^t_j$, else $y_{i, j}=0$.
+    The predicted label distribution can be formulated by''')
+    st.latex(r'''
+    p_{i} = \sigma(sim(f^v_i, f^t))
+    ''')
+
+    st.markdown('''
+    We can define the image to text loss as
+    ''')
+
+    st.latex(r'''
+    \mathcal{L}_{i2t} = KL(\mathbf{p_i} || \mathbf{q_i})
+    ''')
+
+    st.markdown('Where $\\mathbf{q_i}$, the true probability distribution, is defined as')
+
+    st.latex(r'''
+    q_{i, j} = \frac{y_{i, j}}{\sum_{k=1}^{N} y_{i, k}}
+    ''')
+
+    st.markdown('It should be noted that the reason this computation is needed is because there could be multiple correct labels.')
+
+    st.markdown('The SDM loss can be formulated as')
+    st.latex(r'''
+    \mathcal{L}_{sdm} = \mathcal{L}_{i2t} + \mathcal{L}_{t2i}
+    ''')
+
+with st.expander('IRR (MLM) Loss'):
+    ...
+with st.expander('ID Loss'):
+    ...
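As a quick worked check of the $q_{i,j}$ target described in the SDM expander: in a mini-batch of four pairs where text $i$ matches images 1 and 3, the label row is $y_i = (1, 0, 1, 0)$ and it normalises to

q_i = \frac{(1, 0, 1, 0)}{\sum_k y_{i,k}} = \left(\tfrac{1}{2},\ 0,\ \tfrac{1}{2},\ 0\right)

so the target mass is split evenly over both correct pairs rather than forming a one-hot label, which is exactly why the normalisation is needed when multiple labels can be correct; with a single match it reduces to the usual one-hot target.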