Spaces:
Runtime error
Runtime error
Jerome Blin
committed on
Commit
•
7484424
1
Parent(s):
ee9b74a
Add application file
Browse files- app.py +1 -0
- fis/__init__.py +0 -0
- fis/__pycache__/__init__.cpython-37.pyc +0 -0
- fis/app/__pycache__/app.cpython-37.pyc +0 -0
- fis/app/app.py +39 -0
- fis/feature_extraction/__pycache__/run.cpython-37.pyc +0 -0
- fis/feature_extraction/detection/__pycache__/base.cpython-37.pyc +0 -0
- fis/feature_extraction/detection/__pycache__/dummy.cpython-37.pyc +0 -0
- fis/feature_extraction/detection/base.py +18 -0
- fis/feature_extraction/detection/dummy.py +23 -0
- fis/feature_extraction/embedding/__pycache__/base.cpython-37.pyc +0 -0
- fis/feature_extraction/embedding/__pycache__/timm.cpython-37.pyc +0 -0
- fis/feature_extraction/embedding/base.py +18 -0
- fis/feature_extraction/embedding/timm.py +63 -0
- fis/feature_extraction/pipeline/__pycache__/base.cpython-37.pyc +0 -0
- fis/feature_extraction/pipeline/__pycache__/factory.cpython-37.pyc +0 -0
- fis/feature_extraction/pipeline/__pycache__/pipeline.cpython-37.pyc +0 -0
- fis/feature_extraction/pipeline/base.py +85 -0
- fis/feature_extraction/pipeline/factory.py +51 -0
- fis/feature_extraction/pipeline/pipeline.py +11 -0
- fis/feature_extraction/run.py +51 -0
- fis/similarity_search/milvus/__pycache__/collection.cpython-37.pyc +0 -0
- fis/similarity_search/milvus/collection.py +58 -0
- fis/utils/__pycache__/config.cpython-37.pyc +0 -0
- fis/utils/__pycache__/constants.cpython-37.pyc +0 -0
- fis/utils/__pycache__/s3.cpython-37.pyc +0 -0
- fis/utils/config.py +16 -0
- fis/utils/constants.py +1 -0
- fis/utils/data/__pycache__/download_fashionpedia.cpython-37.pyc +0 -0
- fis/utils/data/download_fashionpedia.py +56 -0
- fis/utils/s3.py +38 -0
- requirements.txt +0 -0
app.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from fis.app import app # noqa: F401
|
fis/__init__.py
ADDED
File without changes
|
fis/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (149 Bytes). View file
|
|
fis/app/__pycache__/app.cpython-37.pyc
ADDED
Binary file (1.35 kB). View file
|
|
fis/app/app.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
import numpy as np
|
6 |
+
from datasets import load_dataset
|
7 |
+
from PIL.Image import Image as Img
|
8 |
+
|
9 |
+
from fis.feature_extraction.pipeline.pipeline import factory
|
10 |
+
from fis.utils.constants import ORGANISATION
|
11 |
+
from fis.utils.s3 import read_image_from_s3
|
12 |
+
|
13 |
+
# Ugly fix of "OMP: Error #15: Initializing libomp.a, but found libiomp5.dylib already initialized."
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"


PIPELINE_NAME = "dummy_swin_pipe"

# Encoding pipeline used to embed the query image, looked up by name in the shared factory.
pipeline = factory.get(PIPELINE_NAME)

# A Hub repository id is "<namespace>/<name>" with a forward slash. Build it explicitly
# instead of with os.path.join, which would insert a backslash on Windows.
DATASET_PATH = f"{ORGANISATION}/dummy_swin_pipe_debug"
dataset = load_dataset(path=DATASET_PATH, split="train")
# Index the stored embeddings so that nearest-neighbour queries can be run against them.
dataset.add_faiss_index(column="embedding")
|
24 |
+
|
25 |
+
|
26 |
+
def find_most_similar(image: np.ndarray) -> List[Img]:
    """Return the images whose stored embeddings are closest to the query image.

    Args:
        image: Query image handed over by the Gradio interface.

    Returns:
        Images read from S3 for the 5 nearest entries of the dataset.
    """
    # The pipeline yields one embedding per detected item; only the first is queried.
    query_embedding = pipeline.encode(image)[0]

    _, neighbours = dataset.get_nearest_examples("embedding", query_embedding, k=5)

    return [read_image_from_s3(path) for path in neighbours["path"]]
|
37 |
+
|
38 |
+
|
39 |
+
# One image output component per neighbour returned by find_most_similar (k=5).
gr.Interface(fn=find_most_similar, inputs="image", outputs=["image"] * 5).launch()
|
fis/feature_extraction/__pycache__/run.cpython-37.pyc
ADDED
Binary file (1.52 kB). View file
|
|
fis/feature_extraction/detection/__pycache__/base.cpython-37.pyc
ADDED
Binary file (763 Bytes). View file
|
|
fis/feature_extraction/detection/__pycache__/dummy.cpython-37.pyc
ADDED
Binary file (962 Bytes). View file
|
|
fis/feature_extraction/detection/base.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
|
3 |
+
from PIL import Image
|
4 |
+
|
5 |
+
|
6 |
+
class BaseDetector(ABC):
    """Base class for detection models."""

    @abstractmethod
    def __call__(self, image: Image.Image) -> list:
        """Detect items in an image.

        Args:
            image: Image to run the detection on.

        Returns:
            Bounding boxes of the detected items, one entry per item.
        """
|
fis/feature_extraction/detection/dummy.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Tuple
|
2 |
+
|
3 |
+
from PIL import Image
|
4 |
+
|
5 |
+
from fis.feature_extraction.detection.base import BaseDetector
|
6 |
+
|
7 |
+
|
8 |
+
class DummyDetector(BaseDetector):
    """Dummy detection model."""

    def __call__(self, image: Image.Image) -> List[Tuple[int, int, int, int]]:
        """Return a single bounding box covering the whole image.

        Args:
            image: Image

        Returns:
            One-element list holding the box (x_min, y_min, x_max, y_max),
            which spans the full width and height of the image.
        """
        width, height = image.size

        return [(0, 0, width, height)]
|
fis/feature_extraction/embedding/__pycache__/base.cpython-37.pyc
ADDED
Binary file (753 Bytes). View file
|
|
fis/feature_extraction/embedding/__pycache__/timm.cpython-37.pyc
ADDED
Binary file (2.19 kB). View file
|
|
fis/feature_extraction/embedding/base.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
|
3 |
+
from PIL import Image
|
4 |
+
|
5 |
+
|
6 |
+
class BaseEncoder(ABC):
    """Base class for encoders."""

    @abstractmethod
    def __call__(self, image: Image.Image) -> object:
        """Get the embedding of an image.

        Args:
            image: Image to encode

        Returns:
            Embedding of the image; the concrete type is chosen by the subclass.
        """
|
fis/feature_extraction/embedding/timm.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Callable, Tuple

import numpy as np
import timm
import torch
from PIL import Image
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

from fis.feature_extraction.embedding.base import BaseEncoder
|
10 |
+
|
11 |
+
|
12 |
+
class TimmModel(BaseEncoder):
    """Encoder backed by a pretrained model from the timm library."""

    def __init__(self, model_name: str) -> None:
        """Instantiate the model class.

        Args:
            model_name: Name of the model in the timm library.
        """
        model, transform = self._creat_timm_model(model_name=model_name)

        self._model_name = model_name
        self._model = model
        self._transform = transform

    @property
    def model_name(self) -> str:
        """Name of the underlying timm model."""
        return self._model_name

    @staticmethod
    def _creat_timm_model(model_name: str) -> Tuple[torch.nn.Module, Callable]:
        """Create a model and its associated input transformation.

        Args:
            model_name: Name of the model in the timm library.

        Returns:
            model and transformation function for input images.
        """
        # num_classes=0 asks timm for a model without its classification head,
        # so a forward pass yields features instead of class logits.
        model = timm.create_model(model_name=model_name, pretrained=True, num_classes=0)
        model.eval()

        # The preprocessing is derived from the model's own data configuration.
        config = resolve_data_config({}, model=model)
        transform = create_transform(**config)

        return model, transform

    def __call__(self, image: Image.Image) -> np.ndarray:
        """Get embeddings from an image.

        Args:
            image: Image to encode

        Returns:
            Embedding of the image as a NumPy array (batch dimension removed).
        """
        # Greyscale, palette, RGBA or CMYK images would not match the
        # three-channel preprocessing, so normalise the mode first.
        rgb_image = image.convert("RGB")
        tensor = self._transform(rgb_image).unsqueeze(0)  # transform and add batch dimension

        with torch.no_grad():
            embedding = self._model(tensor)

        return embedding.numpy()[0]
|
fis/feature_extraction/pipeline/__pycache__/base.cpython-37.pyc
ADDED
Binary file (2.77 kB). View file
|
|
fis/feature_extraction/pipeline/__pycache__/factory.cpython-37.pyc
ADDED
Binary file (2.14 kB). View file
|
|
fis/feature_extraction/pipeline/__pycache__/pipeline.cpython-37.pyc
ADDED
Binary file (569 Bytes). View file
|
|
fis/feature_extraction/pipeline/base.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, List
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
from PIL import Image
|
6 |
+
from PIL.Image import Image as Img
|
7 |
+
|
8 |
+
from fis.feature_extraction.detection.base import BaseDetector
|
9 |
+
from fis.feature_extraction.embedding.base import BaseEncoder
|
10 |
+
|
11 |
+
|
12 |
+
class EncodingPipeline:
    """Apply the detection and embedding models to an image."""

    def __init__(self, name: str, detection_model: BaseDetector, embedding_model: BaseEncoder) -> None:
        """Initialize the encoding pipeline.

        Args:
            name: Name of the pipeline.
            detection_model: Model used to detect the fashion items in the images.
            embedding_model: Model used to generate embeddings for each detected item.
        """
        self._name = name
        self._detection_model = detection_model
        self._embedding_model = embedding_model

    def encode(self, image: Any) -> List[Any]:
        """Encode each item from an image into an embedding.

        Args:
            image: PIL image, NumPy array, or path to an image file.

        Raises:
            TypeError: if the type of image is not supported.

        Returns:
            Embeddings for each detected item in the image, as returned by
            the embedding model.
        """
        image = self._load_images(image)
        bboxes = self._detection_model(image)
        items = self._crop_images(image, bboxes)

        return [self._embedding_model(item) for item in items]

    def _load_images(self, image: Any) -> Img:
        """Convert the supported inputs to a PIL image.

        Args:
            image: PIL image (returned unchanged), NumPy array, or path to
                an image on disk.

        Raises:
            TypeError: if the type of image is incorrect.

        Returns:
            PIL Image.
        """
        if isinstance(image, Img):
            return image
        if isinstance(image, np.ndarray):
            return Image.fromarray(image)
        if isinstance(image, str):
            return Image.open(image)

        raise TypeError(f"Unknown type for image: {type(image)}")

    def _crop_images(self, image: Img, bboxes: List[Any]) -> List[Img]:
        """Crop an image based on bounding boxes.

        Args:
            image: Image to crop items from.
            bboxes: Bounding boxes, each one containing an item.

        Returns:
            List of cropped images, one per bounding box.
        """
        return [image.crop(bbox) for bbox in bboxes]
|
fis/feature_extraction/pipeline/factory.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fis.feature_extraction.detection.base import BaseDetector
|
2 |
+
from fis.feature_extraction.embedding.base import BaseEncoder
|
3 |
+
from fis.feature_extraction.pipeline.base import EncodingPipeline
|
4 |
+
|
5 |
+
|
6 |
+
class PipelineFactory:
    """Factory method for encoding pipelines.

    Example use:
    >>> from fis.feature_extraction.detection.dummy import DummyDetector
    >>> from fis.feature_extraction.embedding.timm import TimmModel
    >>> from fis.feature_extraction.pipeline.factory import PipelineFactory
    >>> factory = PipelineFactory()
    >>> factory.register_pipeline(
    ...     name="example_pipeline",
    ...     detection_model=DummyDetector(),
    ...     embedding_model=TimmModel(model_name="swinv2_base_window8_256"),
    ... )
    >>> pipeline = factory.get('example_pipeline')
    """

    def __init__(self):
        """Instantiate factory object."""
        # Registered pipelines, keyed by their name.
        self._pipelines = {}

    def register_pipeline(self, name: str, detection_model: BaseDetector, embedding_model: BaseEncoder) -> None:
        """Register a new pipeline to the factory.

        A pipeline already registered under the same name is replaced.

        Args:
            name: Name of the pipeline to create.
            detection_model: Instance of a BaseDetector object.
            embedding_model: Instance of a BaseEncoder object.
        """
        pipeline = EncodingPipeline(name=name, detection_model=detection_model, embedding_model=embedding_model)
        self._pipelines[name] = pipeline

    def get(self, name: str) -> EncodingPipeline:
        """Get a pipeline from its name.

        Args:
            name: Name of the pipeline to get.

        Raises:
            ValueError: If no pipeline has been registered with the given name.

        Returns:
            Encoding pipeline.
        """
        try:
            return self._pipelines[name]
        except KeyError:
            # Keep ValueError for callers, but say what went wrong.
            raise ValueError(f"No pipeline registered under the name {name!r}") from None
|
fis/feature_extraction/pipeline/pipeline.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fis.feature_extraction.detection.dummy import DummyDetector
|
2 |
+
from fis.feature_extraction.embedding.timm import TimmModel
|
3 |
+
from fis.feature_extraction.pipeline.factory import PipelineFactory
|
4 |
+
|
5 |
+
# Shared factory instance; importing this module registers the pipeline below.
factory = PipelineFactory()

# Whole-image "detector" combined with a Swin V2 encoder from timm.
_dummy_swin_components = {
    "detection_model": DummyDetector(),
    "embedding_model": TimmModel(model_name="swinv2_base_window8_256"),
}
factory.register_pipeline(name="dummy_swin_pipe", **_dummy_swin_components)
|
fis/feature_extraction/run.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from datasets import Dataset
|
3 |
+
from tqdm import tqdm
|
4 |
+
|
5 |
+
from fis.feature_extraction.pipeline.pipeline import factory
|
6 |
+
from fis.utils.constants import ORGANISATION
|
7 |
+
from fis.utils.s3 import list_images_from_bucket, read_image_from_s3
|
8 |
+
|
9 |
+
|
10 |
+
def make_dataset(pipeline_name: str, max_images: int = 100000) -> Dataset:
    """Encode the images stored on S3 and gather the embeddings in a dataset.

    Args:
        pipeline_name: Name of the registered encoding pipeline to use.
        max_images: Maximum number of images to process. Defaults to 100000.

    Returns:
        Dataset with one row per embedding, holding the image "path" and
        its "embedding".
    """
    print("Listing images from S3...")
    images = list_images_from_bucket()[:max_images]
    print(f"{len(images)} images to process.")

    pipeline = factory.get(pipeline_name)
    data = []

    print("Encoding images...")
    for image_name in tqdm(images):
        image = read_image_from_s3(image_name)
        # An image may yield several embeddings (one per detected item);
        # each becomes its own row pointing back to the same image.
        data.extend({"path": image_name, "embedding": embedding} for embedding in pipeline.encode(image))

    return Dataset.from_pandas(pd.DataFrame(data))
|
36 |
+
|
37 |
+
|
38 |
+
def upload_dataset(dataset: Dataset, pipeline_name: str) -> None:
    """Push the dataset to the Hub, in a repository named after the pipeline.

    Args:
        dataset: Dataset to upload.
        pipeline_name: Name of the pipeline, used as the repository name
            inside the ORGANISATION namespace.
    """
    print("Uploading dataset...")
    dataset.push_to_hub(repo_id="{}/{}".format(ORGANISATION, pipeline_name))
|
42 |
+
|
43 |
+
|
44 |
+
def main():
    """Build the embeddings dataset for the dummy Swin pipeline and upload it."""
    name = "dummy_swin_pipe"
    upload_dataset(dataset=make_dataset(pipeline_name=name), pipeline_name=name)


if __name__ == "__main__":
    main()
|
fis/similarity_search/milvus/__pycache__/collection.cpython-37.pyc
ADDED
Binary file (1.47 kB). View file
|
|
fis/similarity_search/milvus/collection.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pymilvus import (
|
2 |
+
Collection,
|
3 |
+
CollectionSchema,
|
4 |
+
DataType,
|
5 |
+
FieldSchema,
|
6 |
+
connections,
|
7 |
+
utility,
|
8 |
+
)
|
9 |
+
|
10 |
+
# Connect to a Milvus server on localhost as soon as this module is imported.
connections.connect(host="127.0.0.1", port="19530")
|
11 |
+
|
12 |
+
|
13 |
+
def create_milvus_collection(collection_name: str, dim: int) -> Collection:
    """Create a Milvus collection holding image paths and their embeddings.

    An existing collection with the same name is dropped first.

    Inspired by https://github.com/milvus-io/bootcamp/blob/master/solutions/reverse_image_search/1_build_image_search_engine.ipynb

    Args:
        collection_name: name of the Milvus collection
        dim: number of dimensions of the embedding vectors

    Returns:
        Milvus collection, with an index created on the "embedding" field
    """
    if utility.has_collection(collection_name):
        utility.drop_collection(collection_name)

    # The keyword is "description"; the former "descrition" spelling meant the
    # field descriptions were never set.
    fields = [
        FieldSchema(
            name="id",
            dtype=DataType.INT64,
            description="ids",
            is_primary=True,
            auto_id=False,
        ),
        FieldSchema(
            name="path",
            dtype=DataType.VARCHAR,
            description="path to image",
            max_length=500,
        ),
        FieldSchema(
            name="embedding",
            dtype=DataType.FLOAT_VECTOR,
            description="image embedding vectors",
            dim=dim,
        ),
    ]

    schema = CollectionSchema(fields=fields, description="reverse image search")
    collection = Collection(name=collection_name, schema=schema)

    index_params = {"metric_type": "L2", "index_type": "IVF_FLAT", "params": {"nlist": 2048}}
    collection.create_index(field_name="embedding", index_params=index_params)

    return collection
|
fis/utils/__pycache__/config.cpython-37.pyc
ADDED
Binary file (520 Bytes). View file
|
|
fis/utils/__pycache__/constants.cpython-37.pyc
ADDED
Binary file (185 Bytes). View file
|
|
fis/utils/__pycache__/s3.cpython-37.pyc
ADDED
Binary file (1.06 kB). View file
|
|
fis/utils/config.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
|
5 |
+
# Read environment variables from .env file.
load_dotenv()

# Project root: three directory levels above this file.
DIR_ROOT = Path(__file__).resolve().parents[2]
DIR_DATA = DIR_ROOT.joinpath("data")
DIR_SCRAPING = DIR_DATA.joinpath("scraping")
DIR_SCRAPING_IMAGES = DIR_SCRAPING.joinpath("images")

# S3 bucket holding the images, and the URI of its images/ prefix.
S3_BUCKET = "fashion-img-search"
S3_BUCKET_IMAGES = f"s3://{S3_BUCKET}/images/"

FILE_SCRAPING_DATA = "items.jsonl"
|
fis/utils/constants.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Hub namespace used as the prefix of the project's dataset repository ids.
ORGANISATION = "FSDL-Fashion"
|
fis/utils/data/__pycache__/download_fashionpedia.cpython-37.pyc
ADDED
Binary file (1.81 kB). View file
|
|
fis/utils/data/download_fashionpedia.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import urllib.request
|
3 |
+
import zipfile
|
4 |
+
|
5 |
+
from fis.utils.config import DIR_DATA
|
6 |
+
|
7 |
+
# Remote annotation files (JSON), downloaded from S3.
RAW_TRAIN_ANNOTATIONS = "https://s3.amazonaws.com/ifashionist-dataset/annotations/instances_attributes_train2020.json"
RAW_VAL_ANNOTATIONS = "https://s3.amazonaws.com/ifashionist-dataset/annotations/instances_attributes_val2020.json"
# Remote image archives (zip), downloaded from S3.
RAW_TRAIN_IMAGES = "https://s3.amazonaws.com/ifashionist-dataset/images/train2020.zip"
RAW_VAL_IMAGES = "https://s3.amazonaws.com/ifashionist-dataset/images/val_test2020.zip"

# File names given to the annotations once stored on local disk.
TRAIN_ANNOTATIONS = "train.json"
VAL_ANNOTATIONS = "val.json"
|
16 |
+
|
17 |
+
|
18 |
+
def download(url: str, target: str) -> None:
    """Download image and annotations.

    A URL ending in ".zip" is treated as an image archive: it is downloaded,
    extracted into ``target`` and the downloaded archive is then removed.
    Any other URL is saved as-is to ``target``.

    Args:
        url: url to download from.
        target: file or directory to download to.
    """
    print(f"Downloading from {url}")

    # Annotations: a plain file, written straight to its destination.
    if not url.endswith(".zip"):
        urllib.request.urlretrieve(url=url, filename=target)  # noqa
        return

    # Images: fetch the archive, then unpack it into the target directory.
    path, _ = urllib.request.urlretrieve(url=url)  # noqa
    try:
        with zipfile.ZipFile(path, "r") as f:
            f.extractall(target)
    finally:
        # Remove the archive even when extraction fails, so that a corrupt
        # download does not leave a large file behind.
        os.remove(path)
|
38 |
+
|
39 |
+
|
40 |
+
def download_fashionpedia(destination_dir: str = DIR_DATA) -> None:
    """Fetch the Fashionpedia annotations and images.

    Args:
        destination_dir: directory where the dataset will be saved.
    """
    os.makedirs(destination_dir, exist_ok=True)

    # Annotations are stored under explicit file names; the image archives
    # are unpacked directly into the destination directory.
    jobs = (
        (RAW_TRAIN_ANNOTATIONS, os.path.join(destination_dir, TRAIN_ANNOTATIONS)),
        (RAW_VAL_ANNOTATIONS, os.path.join(destination_dir, VAL_ANNOTATIONS)),
        (RAW_TRAIN_IMAGES, destination_dir),
        (RAW_VAL_IMAGES, destination_dir),
    )
    for source_url, destination in jobs:
        download(url=source_url, target=destination)


if __name__ == "__main__":
    download_fashionpedia()
|
fis/utils/s3.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
import boto3
|
5 |
+
from PIL import Image
|
6 |
+
|
7 |
+
from fis.utils import config as cfg
|
8 |
+
|
9 |
+
# S3 resource created once at import time and shared by the helpers in this module.
s3 = boto3.resource("s3")
|
10 |
+
|
11 |
+
|
12 |
+
def list_images_from_bucket(bucket: str = cfg.S3_BUCKET) -> List[str]:
    """Collect the keys of the jpeg images stored in a bucket.

    Args:
        bucket: Name of the bucket. Defaults to cfg.S3_BUCKET.

    Returns:
        Keys of every object whose key contains ".jpg".
    """
    all_keys = (entry.key for entry in s3.Bucket(bucket).objects.all())

    return [key for key in all_keys if ".jpg" in key]
|
30 |
+
|
31 |
+
|
32 |
+
def read_image_from_s3(key, bucket: str = cfg.S3_BUCKET):
    """Load an image stored in an S3 bucket.

    Args:
        key: Key of the image object inside the bucket.
        bucket: Name of the bucket. Defaults to cfg.S3_BUCKET.

    Returns:
        PIL image opened from the bytes of the object.
    """
    response = s3.Bucket(bucket).Object(key).get()
    payload = response.get("Body").read()

    return Image.open(io.BytesIO(payload))
|
requirements.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|