altndrr committed on
Commit a3ee979
1 Parent(s): 3070a83

Add first version

.gitignore ADDED
@@ -0,0 +1,151 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # pytype type checking
+ .pytype/
+
+ # Pyre type checker
+ .pyre/
+
+ ### VisualStudioCode
+ .vscode/*
+ !.vscode/settings.json
+ !.vscode/tasks.json
+ !.vscode/launch.json
+ !.vscode/extensions.json
+ *.code-workspace
+ **/.vscode
+
+ # JetBrains
+ .idea/
+
+ # Data & Models
+ *.h5
+ *.tar
+ *.tar.gz
+
+ # Template
+ /artifacts/models/databases/*/
README.md CHANGED
@@ -1,10 +1,11 @@
  ---
- title: Vic
+ title: Vocabulary-free Image Classification
  emoji: 🌍
- colorFrom: gray
- colorTo: gray
+ colorFrom: green
+ colorTo: yellow
  sdk: gradio
  sdk_version: 3.33.1
+ python_version: 3.9
  app_file: app.py
  pinned: false
  ---
app.py ADDED
@@ -0,0 +1,78 @@
+ from typing import Optional
+
+ import gradio as gr
+ import torch
+
+ from src.nn import CaSED
+
+ PAPER_TITLE = "Vocabulary-free Image Classification"
+ PAPER_DESCRIPTION = """
+
+
+ <div style="display: flex; align-items: center; justify-content: center; margin-bottom: 1rem;">
+     <a href="https://github.com/altndrr/vic" style="margin-right: 0.5rem;">
+         <img src="https://img.shields.io/badge/code-github.altndrr%2Fvic-blue.svg"/>
+     </a>
+     <a href="https://arxiv.org/abs/2306.00917" style="margin-right: 0.5rem;">
+         <img src="https://img.shields.io/badge/paper-arXiv%3A2306.00917-B31B1B.svg"/>
+     </a>
+     <a href="https://altndrr.github.io/vic/" style="margin-right: 0.5rem;">
+         <img src="https://img.shields.io/badge/website-gh--pages.altndrr%2Fvic-success.svg"/>
+     </a>
+ </div>
+
+
+ Vocabulary-free Image Classification aims to assign a class to an image *without* prior knowledge
+ of the list of class names, thus operating on the semantic class space that contains all possible
+ concepts. Our proposed method CaSED finds the best-matching category within this unconstrained
+ semantic space by leveraging multimodal data from large vision-language databases. We first
+ retrieve the semantically most similar captions from a database, from which we extract a set of
+ candidate categories by applying text parsing and filtering techniques. We then score the
+ candidates using the multimodal aligned representation of a large pre-trained VLM, *i.e.* CLIP,
+ to obtain the best-matching category, using *alpha* as a hyperparameter to control the trade-off
+ between the visual and textual similarity.
+ """
+ PAPER_URL = "https://arxiv.org/abs/2306.00917"
+
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ model = CaSED().to(DEVICE).eval()
+
+
+ def vic(filename: str, alpha: Optional[float] = None):
+     # get the outputs of the model
+     vocabulary, scores = model(filename, alpha=alpha)
+     confidences = dict(zip(vocabulary, scores))
+
+     return confidences
+
+ def resize_image(image, max_size: int = 256):
+     """Resize image so that its shorter side equals max_size, keeping the aspect ratio."""
+     width, height = image.size
+     if width > height:
+         ratio = width / height
+         new_width = max_size * ratio
+         new_height = max_size
+     else:
+         ratio = height / width
+         new_width = max_size
+         new_height = max_size * ratio
+     return image.resize((int(new_width), int(new_height)))
+
+
+ demo = gr.Interface(
+     fn=vic,
+     inputs=[
+         gr.Image(type="filepath", label="input"),
+         gr.Slider(0.0, 1.0, value=0.5, label="alpha"),
+     ],
+     outputs=[gr.Label(num_top_classes=5, label="output")],
+     title=PAPER_TITLE,
+     description=PAPER_DESCRIPTION,
+     article=f"Check out <a href='{PAPER_URL}'>the original paper</a> for more information.",
+     examples="./artifacts/examples/",
+     allow_flagging="never",
+     theme=gr.themes.Soft(),
+ )
+
+ demo.launch(share=False)
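
For reference, the `vic` function above is a thin wrapper around a single call to the CaSED model; a minimal sketch of calling the model directly, outside the Gradio interface (it assumes the cc12m retrieval database has already been fetched by `CaSED.prepare_data()` and uses one of the bundled example images):

# Minimal sketch: query the model directly instead of through the Gradio demo.
# Assumes the database files exist under artifacts/models/databases/.
from src.nn import CaSED

model = CaSED().eval()

# alpha weights the image-based prediction against the caption-based one.
vocabulary, scores = model("artifacts/examples/ramen.jpg", alpha=0.5)

# Print the candidate categories from most to least confident.
for word, score in sorted(zip(vocabulary, scores), key=lambda pair: -pair[1]):
    print(f"{word}: {score:.3f}")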
artifacts/examples/basketball.jpg ADDED
artifacts/examples/cassowary.jpg ADDED
artifacts/examples/colosseum.jpg ADDED
artifacts/examples/desk.jpg ADDED
artifacts/examples/kitchen.jpg ADDED
artifacts/examples/log.csv ADDED
@@ -0,0 +1,11 @@
+ image_fp
+ basketball.jpg
+ cassowary.jpg
+ colosseum.jpg
+ desk.jpg
+ kitchen.jpg
+ monkey.jpg
+ park.jpg
+ ramen.jpg
+ sagrada.jpg
+ venice.jpg
artifacts/examples/monkey.jpg ADDED
artifacts/examples/park.jpg ADDED
artifacts/examples/ramen.jpg ADDED
artifacts/examples/sagrada.jpg ADDED
artifacts/examples/venice.jpg ADDED
artifacts/models/databases/.gitkeep ADDED
File without changes
artifacts/models/retrieval/indices.json ADDED
@@ -0,0 +1,3 @@
+ {
+     "ViT-L-14_CC12M": "./artifacts/models/databases/cc12m/vit-l-14/"
+ }
flagged/.gitkeep ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ torch==2.0.1
+ torchvision==0.15.2
+ faiss-cpu==1.7.4
+ flair==0.12.2
+ gradio==3.33.1
+ gdown==4.4.0
+ inflect==6.0.4
+ nltk==3.8.1
+ open_clip_torch==2.20.0
+ transformers==4.26.1
src/nn.py ADDED
@@ -0,0 +1,330 @@
+ import json
+ import tarfile
+ from pathlib import Path
+ from typing import Optional
+
+ import faiss
+ import gdown
+ import numpy as np
+ import open_clip
+ import torch
+ from open_clip.transformer import Transformer
+ from PIL import Image
+
+ from src.retrieval import ArrowMetadataProvider, meta_to_dict
+ from src.transforms import TextCompose, default_vocabulary_transforms
+
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ RETRIEVAL_DATABASES = {
+     "cc12m": "https://drive.google.com/uc?id=1HyM4mnKSxF0sqzAe-KZL8y-cQWRPiuXn&confirm=t",
+ }
+
+
+ class CaSED(torch.nn.Module):
+     """Torch module for Category Search from External Databases (CaSED).
+
+     Args:
+         index_name (str): Name of the faiss index to use.
+         vocabulary_transforms (TextCompose): List of transforms to apply to the vocabulary.
+         model_name (str): Name of the CLIP model to use. Defaults to "ViT-L-14".
+         pretrained (str): Pretrained weights to use for the CLIP model. Defaults to "openai".
+
+     Extra hparams:
+         alpha (float): Weight for the average of the image and text predictions. Defaults to 0.5.
+         artifact_dir (str): Path to the directory where the databases are stored. Defaults to
+             "artifacts/".
+         retrieval_num_results (int): Number of results to return. Defaults to 10.
+         vocabulary_prompt (str): Prompt to use for the vocabulary. Defaults to "{}".
+         tau (float): Temperature to use for the classifier. Defaults to 1.0.
+     """
+
+     def __init__(
+         self,
+         index_name: str = "ViT-L-14_CC12M",
+         vocabulary_transforms: TextCompose = default_vocabulary_transforms(),
+         model_name: str = "ViT-L-14",
+         pretrained: str = "openai",
+         vocabulary_prompt: str = "{}",
+         **kwargs,
+     ):
+         super().__init__()
+         self._prev_vocab_words = None
+         self._prev_used_prompts = None
+         self._prev_vocab_words_z = None
+
+         model, _, preprocess = open_clip.create_model_and_transforms(
+             model_name, pretrained=pretrained, device="cpu"
+         )
+         tokenizer = open_clip.get_tokenizer(model_name)
+         self.tokenizer = tokenizer
+         self.preprocess = preprocess
+
+         kwargs["alpha"] = kwargs.get("alpha", 0.5)
+         kwargs["artifact_dir"] = kwargs.get("artifact_dir", "artifacts/")
+         kwargs["retrieval_num_results"] = kwargs.get("retrieval_num_results", 10)
+         vocabulary_prompt = kwargs.get("vocabulary_prompt", "{}")
+         kwargs["vocabulary_prompts"] = [vocabulary_prompt]
+         kwargs["tau"] = kwargs.get("tau", 1.0)
+         self.hparams = kwargs
+
+         language_encoder = LanguageTransformer(
+             model.transformer,
+             model.token_embedding,
+             model.positional_embedding,
+             model.ln_final,
+             model.text_projection,
+             model.attn_mask,
+         )
+         scale = model.logit_scale.exp().item()
+         classifier = NearestNeighboursClassifier(scale=scale, tau=self.hparams["tau"])
+
+         self.index_name = index_name
+         self.vocabulary_transforms = vocabulary_transforms
+         self.vision_encoder = model.visual
+         self.language_encoder = language_encoder
+         self.classifier = classifier
+
+         # download databases
+         self.prepare_data()
+
+         # load faiss indices
+         indices_list_dir = Path(self.hparams["artifact_dir"]) / "models" / "retrieval"
+         indices_fp = indices_list_dir / "indices.json"
+         self.indices = json.load(open(indices_fp, "r"))
+
+         # load faiss indices and metadata providers
+         self.resources = {}
+         for name, index_fp in self.indices.items():
+             text_index_fp = Path(index_fp) / "text.index"
+             metadata_fp = Path(index_fp) / "metadata/"
+
+             text_index = faiss.read_index(
+                 str(text_index_fp), faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
+             )
+             metadata_provider = ArrowMetadataProvider(metadata_fp)
+
+             self.resources[name] = {
+                 "device": DEVICE,
+                 "model": model_name,
+                 "text_index": text_index,
+                 "metadata_provider": metadata_provider,
+             }
+
+     def prepare_data(self):
+         """Download data if needed."""
+         databases_path = Path(self.hparams["artifact_dir"]) / "models" / "databases"
+
+         for name, url in RETRIEVAL_DATABASES.items():
+             database_path = Path(databases_path, name)
+             if database_path.exists():
+                 continue
+
+             # download data
+             target_path = Path(databases_path, name + ".tar.gz")
+             try:
+                 gdown.download(url, str(target_path), quiet=False)
+                 tar = tarfile.open(target_path, "r:gz")
+                 tar.extractall(target_path.parent)
+                 tar.close()
+                 target_path.unlink()
+             except FileNotFoundError:
+                 print(f"Could not download {url}.")
+                 print(f"Please download it manually and place it in {target_path.parent}.")
+
+     @torch.no_grad()
+     def query_index(self, sample_z: torch.Tensor) -> list:
+         # get the index
+         resources = self.resources[self.index_name]
+         text_index = resources["text_index"]
+         metadata_provider = resources["metadata_provider"]
+
+         # query the index
+         sample_z = sample_z.squeeze(0)
+         sample_z = sample_z / sample_z.norm(dim=-1, keepdim=True)
+         query_input = sample_z.cpu().detach().numpy().tolist()
+         query = np.expand_dims(np.array(query_input).astype("float32"), 0)
+
+         distances, idxs, _ = text_index.search_and_reconstruct(
+             query, self.hparams["retrieval_num_results"]
+         )
+         results = idxs[0]
+         nb_results = np.where(results == -1)[0]
+         nb_results = nb_results[0] if len(nb_results) > 0 else len(results)
+         indices = results[:nb_results]
+         distances = distances[0][:nb_results]
+
+         if len(distances) == 0:
+             return []
+
+         # get the metadata
+         results = []
+         metadata = metadata_provider.get(indices[:20], ["caption"])
+         for key, (d, i) in enumerate(zip(distances, indices)):
+             output = {}
+             meta = None if key + 1 > len(metadata) else metadata[key]
+             if meta is not None:
+                 output.update(meta_to_dict(meta))
+             output["id"] = i.item()
+             output["similarity"] = d.item()
+             results.append(output)
+
+         # get the captions only
+         vocabularies = [result["caption"] for result in results]
+
+         return vocabularies
+
+     @torch.no_grad()
+     def encode_vocabulary(self, vocabulary: list, use_prompts: bool = False) -> torch.Tensor:
+         """Encode a vocabulary.
+
+         Args:
+             vocabulary (list): List of words.
+         """
+         # check if vocabulary has changed
+         if vocabulary == self._prev_vocab_words and use_prompts == self._prev_used_prompts:
+             return self._prev_vocab_words_z
+
+         # tokenize vocabulary
+         classes = [c.replace("_", " ") for c in vocabulary]
+         prompts = self.hparams["vocabulary_prompts"] if use_prompts else ["{}"]
+         texts_views = [[p.format(c) for c in classes] for p in prompts]
+         tokenized_texts_views = [
+             torch.cat([self.tokenizer(prompt) for prompt in class_prompts])
+             for class_prompts in texts_views
+         ]
+         tokenized_texts_views = torch.stack(tokenized_texts_views).to(DEVICE)
+
+         # encode vocabulary
+         T, C, _ = tokenized_texts_views.shape
+         texts_z_views = self.language_encoder(tokenized_texts_views.view(T * C, -1))
+         texts_z_views = texts_z_views.view(T, C, -1)
+         texts_z_views = texts_z_views / texts_z_views.norm(dim=-1, keepdim=True)
+
+         # cache vocabulary
+         self._prev_vocab_words = vocabulary
+         self._prev_used_prompts = use_prompts
+         self._prev_vocab_words_z = texts_z_views
+
+         return texts_z_views
+
+     @torch.no_grad()
+     def forward(self, image_fp: str, alpha: Optional[float] = None) -> tuple:
+         image = self.preprocess(Image.open(image_fp)).unsqueeze(0)
+         image_z = self.vision_encoder(image.to(DEVICE))
+
+         # get the vocabulary
+         vocabulary = self.query_index(image_z)
+
+         # generate a single text embedding from the unfiltered vocabulary
+         unfiltered_vocabulary_z = self.encode_vocabulary(vocabulary).squeeze(0)
+         text_z = unfiltered_vocabulary_z.mean(dim=0)
+         text_z = text_z / text_z.norm(dim=-1, keepdim=True)
+         text_z = text_z.unsqueeze(0)
+
+         # filter the vocabulary, embed it, and get its mean embedding
+         vocabulary = self.vocabulary_transforms(vocabulary) or ["object"]
+         vocabulary_z = self.encode_vocabulary(vocabulary, use_prompts=True)
+         mean_vocabulary_z = vocabulary_z.mean(dim=0)
+         mean_vocabulary_z = mean_vocabulary_z / mean_vocabulary_z.norm(dim=-1, keepdim=True)
+
+         # get the image and text predictions
+         image_p = self.classifier(image_z, vocabulary_z)
+         text_p = self.classifier(text_z, vocabulary_z)
+
+         # average the image and text predictions
+         alpha = self.hparams["alpha"] if alpha is None else alpha
+         sample_p = alpha * image_p + (1 - alpha) * text_p
+
+         # get the scores
+         sample_p = sample_p.cpu()
+         scores = sample_p[0].tolist()
+
+         del image_z, unfiltered_vocabulary_z, text_z, vocabulary_z, mean_vocabulary_z
+         del image_p, text_p, sample_p
+
+         return vocabulary, scores
+
+
+ class NearestNeighboursClassifier(torch.nn.Module):
+     """Nearest neighbours classifier.
+
+     It computes the similarity between the query and the supports using the
+     cosine similarity and then applies a softmax to obtain the logits.
+
+     Args:
+         scale (float): Scale for the logits of the query. Defaults to 1.0.
+         tau (float): Temperature for the softmax. Defaults to 1.0.
+     """
+
+     def __init__(self, scale: float = 1.0, tau: float = 1.0):
+         super().__init__()
+         self.scale = scale
+         self.tau = tau
+
+     def forward(self, query: torch.Tensor, supports: torch.Tensor):
+         query = query / query.norm(dim=-1, keepdim=True)
+         supports = supports / supports.norm(dim=-1, keepdim=True)
+
+         if supports.dim() == 2:
+             supports = supports.unsqueeze(0)
+
+         Q, _ = query.shape
+         N, C, _ = supports.shape
+
+         supports = supports.mean(dim=0)
+         supports = supports / supports.norm(dim=-1, keepdim=True)
+         similarity = self.scale * query @ supports.T
+         similarity = similarity / self.tau if self.tau != 1.0 else similarity
+         logits = similarity.softmax(dim=-1)
+
+         return logits
+
+
+ class LanguageTransformer(torch.nn.Module):
+     """Language Transformer for CLIP.
+
+     Args:
+         transformer (Transformer): Transformer model.
+         token_embedding (torch.nn.Embedding): Token embedding.
+         positional_embedding (torch.nn.Parameter): Positional embedding.
+         ln_final (torch.nn.LayerNorm): Layer norm.
+         text_projection (torch.nn.Parameter): Text projection.
+     """
+
+     def __init__(
+         self,
+         model: Transformer,
+         token_embedding: torch.nn.Embedding,
+         positional_embedding: torch.nn.Parameter,
+         ln_final: torch.nn.LayerNorm,
+         text_projection: torch.nn.Parameter,
+         attn_mask: torch.Tensor,
+     ):
+         super().__init__()
+         self.transformer = model
+         self.token_embedding = token_embedding
+         self.positional_embedding = positional_embedding
+         self.ln_final = ln_final
+         self.text_projection = text_projection
+
+         self.register_buffer("attn_mask", attn_mask, persistent=False)
+
+     def forward(self, text: torch.Tensor) -> torch.Tensor:
+         """Forward pass for the text encoder."""
+         cast_dtype = self.transformer.get_cast_dtype()
+
+         x = self.token_embedding(text).to(cast_dtype)
+
+         x = x + self.positional_embedding.to(cast_dtype)
+         x = x.permute(1, 0, 2)
+         x = self.transformer(x, attn_mask=self.attn_mask)
+         x = x.permute(1, 0, 2)
+         x = self.ln_final(x)
+
+         # x.shape = [batch_size, n_ctx, transformer.width]
+         # take features from the eot embedding (eot_token is the highest number in each sequence)
+         x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
+
+         return x
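
To make the scoring step in `CaSED.forward` concrete, here is a small self-contained sketch of `NearestNeighboursClassifier` and the alpha blending on placeholder embeddings (768 is the ViT-L/14 embedding size; the random tensors are stand-ins, not real CLIP features):

import torch

from src.nn import NearestNeighboursClassifier

# Placeholder embeddings: one image query, one text query, one prompt view over five candidates.
image_z = torch.randn(1, 768)
text_z = torch.randn(1, 768)
vocabulary_z = torch.randn(1, 5, 768)  # (views, candidates, dim)

classifier = NearestNeighboursClassifier(scale=100.0, tau=1.0)
image_p = classifier(image_z, vocabulary_z)  # (1, 5) softmax over the candidates
text_p = classifier(text_z, vocabulary_z)

# Same blending as in CaSED.forward: alpha trades visual against textual similarity.
alpha = 0.5
sample_p = alpha * image_p + (1 - alpha) * text_p
print(sample_p.argmax(dim=-1))  # index of the best-matching candidate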
src/retrieval.py ADDED
@@ -0,0 +1,42 @@
+ from pathlib import Path
+
+ import numpy as np
+ import pyarrow as pa
+
+
+ class ArrowMetadataProvider:
+     """The arrow metadata provider provides metadata from contiguous ids using arrow.
+
+     Code taken from:
+     https://github.dev/rom1504/clip-retrieval
+     """
+
+     def __init__(self, arrow_folder):
+         arrow_files = [str(a) for a in sorted(Path(arrow_folder).glob("**/*")) if a.is_file()]
+         self.table = pa.concat_tables(
+             [
+                 pa.ipc.RecordBatchFileReader(pa.memory_map(arrow_file, "r")).read_all()
+                 for arrow_file in arrow_files
+             ]
+         )
+
+     def get(self, ids, cols=None):
+         """Implement the get method from the arrow metadata provider; get metadata from ids."""
+         if cols is None:
+             cols = self.table.schema.names
+         else:
+             cols = list(set(self.table.schema.names) & set(cols))
+         t = pa.concat_tables([self.table[i:(i + 1)] for i in ids])
+         return t.select(cols).to_pandas().to_dict("records")
+
+
+ def meta_to_dict(meta):
+     """Convert a metadata record to a plain dictionary."""
+     output = {}
+     for k, v in meta.items():
+         if isinstance(v, bytes):
+             v = v.decode()
+         elif type(v).__module__ == np.__name__:
+             v = v.item()
+         output[k] = v
+     return output
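
A short usage sketch of the two helpers above, mirroring how `CaSED.query_index` uses them; the metadata path below is the one implied by `artifacts/models/retrieval/indices.json` and assumes the cc12m database has already been downloaded:

from src.retrieval import ArrowMetadataProvider, meta_to_dict

# Metadata folder of the cc12m index (assumes the database is present locally).
provider = ArrowMetadataProvider("./artifacts/models/databases/cc12m/vit-l-14/metadata/")

# Look up the captions of a few arbitrary row ids, e.g. those returned by the faiss search.
records = provider.get([0, 1, 2], cols=["caption"])
captions = [meta_to_dict(record)["caption"] for record in records]
print(captions)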
src/transforms.py ADDED
@@ -0,0 +1,506 @@
+ import re
+ from abc import ABC, abstractmethod
+ from typing import Any, Optional, Union, cast
+
+ import inflect
+ import nltk
+ import numpy as np
+ import PIL.Image
+ import torch
+ import torchvision.transforms as T
+ import torchvision.transforms.functional as F
+ from flair.data import Sentence
+ from flair.models import SequenceTagger
+
+ __all__ = [
+     "DynamicResize",
+     "DropFileExtensions",
+     "DropNonAlpha",
+     "DropShortWords",
+     "DropSpecialCharacters",
+     "DropTokens",
+     "DropURLs",
+     "DropWords",
+     "FilterPOS",
+     "FrequencyMinWordCount",
+     "FrequencyTopK",
+     "ReplaceSeparators",
+     "ToRGBTensor",
+     "ToLowercase",
+     "ToSingular",
+ ]
+
+
+ class BaseTextTransform(ABC):
+     """Base class for string transforms."""
+
+     @abstractmethod
+     def __call__(self, text: str):
+         raise NotImplementedError
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}()"
+
+
+ class DynamicResize(T.Resize):
+     """Resize the input PIL Image to the given size.
+
+     Extends the torchvision Resize transform to dynamically evaluate the second dimension of the
+     output size based on the aspect ratio of the first input image.
+     """
+
+     def forward(self, img):
+         if isinstance(self.size, int):
+             _, h, w = F.get_dimensions(img)
+             aspect_ratio = w / h
+             side = self.size
+
+             if aspect_ratio < 1.0:
+                 self.size = int(side / aspect_ratio), side
+             else:
+                 self.size = side, int(side * aspect_ratio)
+
+         return super().forward(img)
+
+
+ class DropFileExtensions(BaseTextTransform):
+     """Remove file extensions from the input text."""
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to remove file extensions from.
+         """
+         text = re.sub(r"\.\w+", "", text)
+
+         return text
+
+
+ class DropNonAlpha(BaseTextTransform):
+     """Remove non-alphabetic characters from the input text."""
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to remove non-alphabetic characters from.
+         """
+         text = re.sub(r"[^a-zA-Z\s]", "", text)
+
+         return text
+
+
+ class DropShortWords(BaseTextTransform):
+     """Remove short words from the input text.
+
+     Args:
+         min_length (int): Minimum length of words to keep.
+     """
+
+     def __init__(self, min_length) -> None:
+         super().__init__()
+         self.min_length = min_length
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to remove short words from.
+         """
+         text = " ".join([word for word in text.split() if len(word) >= self.min_length])
+
+         return text
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}(min_length={self.min_length})"
+
+
+ class DropSpecialCharacters(BaseTextTransform):
+     """Remove special characters from the input text.
+
+     Special characters are defined as any character that is not a word character, whitespace,
+     hyphen, period, apostrophe, or ampersand.
+     """
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to remove special characters from.
+         """
+         text = re.sub(r"[^\w\s\-\.\'\&]", "", text)
+
+         return text
+
+
+ class DropTokens(BaseTextTransform):
+     """Remove tokens from the input text.
+
+     Tokens are defined as strings enclosed in angle brackets, e.g. <token>.
+     """
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to remove tokens from.
+         """
+         text = re.sub(r"<[^>]+>", "", text)
+
+         return text
+
+
+ class DropURLs(BaseTextTransform):
+     """Remove URLs from the input text."""
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to remove URLs from.
+         """
+         text = re.sub(r"http\S+", "", text)
+
+         return text
+
+
+ class DropWords(BaseTextTransform):
+     """Remove words from the input text.
+
+     It is case-insensitive and supports singular and plural forms of the words.
+     """
+
+     def __init__(self, words: list[str]) -> None:
+         super().__init__()
+         self.words = words
+         self.pattern = r"\b(?:{})\b".format("|".join(words))
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to remove words from.
+         """
+         text = re.sub(self.pattern, "", text, flags=re.IGNORECASE)
+
+         return text
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}(pattern={self.pattern})"
+
+
+ class FilterPOS(BaseTextTransform):
+     """Filter words by POS tags.
+
+     Args:
+         tags (list): List of POS tags to filter by (removed with "nltk", kept with "flair").
+         engine (str): POS tagger to use. Must be one of "nltk" or "flair". Defaults to "nltk".
+         keep_compound_nouns (bool): Whether to keep composed words. Defaults to True.
+     """
+
+     def __init__(self, tags: list, engine: str = "nltk", keep_compound_nouns: bool = True) -> None:
+         super().__init__()
+         self.tags = tags
+         self.engine = engine
+         self.keep_compound_nouns = keep_compound_nouns
+
+         if engine == "nltk":
+             nltk.download("averaged_perceptron_tagger", quiet=True)
+             nltk.download("punkt", quiet=True)
+             self.tagger = lambda x: nltk.pos_tag(nltk.word_tokenize(x))
+         elif engine == "flair":
+             self.tagger = SequenceTagger.load("flair/pos-english-fast").predict
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to filter by POS tags.
+         """
+         if self.engine == "nltk":
+             word_tags = self.tagger(text)
+             text = " ".join([word for word, tag in word_tags if tag not in self.tags])
+         elif self.engine == "flair":
+             sentence = Sentence(text)
+             self.tagger(sentence)
+             text = " ".join([token.text for token in sentence.tokens if token.tag in self.tags])
+
+         if self.keep_compound_nouns:
+             compound_nouns = []
+
+             if self.engine == "nltk":
+                 for i in range(len(word_tags) - 1):
+                     if word_tags[i][1] == "NN" and word_tags[i + 1][1] == "NN":
+                         # if they are the same word, skip
+                         if word_tags[i][0] == word_tags[i + 1][0]:
+                             continue
+
+                         compound_noun = word_tags[i][0] + "_" + word_tags[i + 1][0]
+                         compound_nouns.append(compound_noun)
+             elif self.engine == "flair":
+                 for i in range(len(sentence.tokens) - 1):
+                     if sentence.tokens[i].tag == "NN" and sentence.tokens[i + 1].tag == "NN":
+                         # if they are the same word, skip
+                         if sentence.tokens[i].text == sentence.tokens[i + 1].text:
+                             continue
+
+                         compound_noun = sentence.tokens[i].text + "_" + sentence.tokens[i + 1].text
+                         compound_nouns.append(compound_noun)
+
+             text = " ".join([text, " ".join(compound_nouns)])
+
+         return text
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}(tags={self.tags}, engine={self.engine})"
+
+
+ class FrequencyMinWordCount(BaseTextTransform):
+     """Keep only words that occur at least a minimum number of times in the input text.
+
+     If the threshold is too strong and no words pass it, the threshold is reduced to the count of
+     the most frequent word.
+
+     Args:
+         min_count (int): Minimum number of occurrences of a word to keep.
+     """
+
+     def __init__(self, min_count) -> None:
+         super().__init__()
+         self.min_count = min_count
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to remove infrequent words from.
+         """
+         if self.min_count <= 1:
+             return text
+
+         words = text.split()
+         word_counts = {word: words.count(word) for word in words}
+
+         # if nothing passes the threshold, reduce the threshold to the most frequent word
+         max_word_count = max(word_counts.values() or [0])
+         min_count = max_word_count if self.min_count > max_word_count else self.min_count
+
+         text = " ".join([word for word in words if word_counts[word] >= min_count])
+
+         return text
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}(min_count={self.min_count})"
+
+
+ class FrequencyTopK(BaseTextTransform):
+     """Keep only the top k most frequent words in the input text.
+
+     In case of a tie, all words with the same count as the last word are kept.
+
+     Args:
+         top_k (int): Number of top words to keep.
+     """
+
+     def __init__(self, top_k: int) -> None:
+         super().__init__()
+         self.top_k = top_k
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to remove infrequent words from.
+         """
+         if self.top_k < 1:
+             return text
+
+         words = text.split()
+         word_counts = {word: words.count(word) for word in words}
+         top_words = sorted(word_counts, key=word_counts.get, reverse=True)
+
+         # in case of a tie, keep all words with the same count as the k-th word
+         min_top_count = word_counts[top_words[: self.top_k][-1]]
+         top_words = [word for word in top_words if word_counts[word] >= min_top_count]
+
+         text = " ".join([word for word in words if word in top_words])
+
+         return text
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}(top_k={self.top_k})"
+
+
+ class ReplaceSeparators(BaseTextTransform):
+     """Replace underscores and dashes with spaces."""
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to replace separators in.
+         """
+         text = re.sub(r"[_\-]", " ", text)
+
+         return text
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}()"
+
+
+ class RemoveDuplicates(BaseTextTransform):
+     """Remove duplicate words from the input text."""
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to remove duplicate words from.
+         """
+         text = " ".join(list(set(text.split())))
+
+         return text
+
+
+ class TextCompose:
+     """Compose several transforms together.
+
+     It differs from the torchvision.transforms.Compose class in that it applies the transforms to
+     a string instead of a PIL Image or Tensor. In addition, it automatically joins the list of
+     input strings into a single string and splits the output string into a list of words.
+
+     Args:
+         transforms (list): List of transforms to compose.
+     """
+
+     def __init__(self, transforms: list[BaseTextTransform]) -> None:
+         self.transforms = transforms
+
+     def __call__(self, text: Union[str, list[str]]) -> Any:
+         if isinstance(text, list):
+             text = " ".join(text)
+
+         for t in self.transforms:
+             text = t(text)
+         return text.split()
+
+     def __repr__(self) -> str:
+         format_string = self.__class__.__name__ + "("
+         for t in self.transforms:
+             format_string += "\n"
+             format_string += f"    {t}"
+         format_string += "\n)"
+         return format_string
+
+
+ class ToRGBTensor(T.ToTensor):
+     """Convert a `PIL Image` or `numpy.ndarray` to tensor.
+
+     Compared with the torchvision `ToTensor` transform, it converts images with a single channel
+     to RGB images. In addition, the conversion to tensor is done only if the input is not already
+     a tensor.
+     """
+
+     def __call__(self, pic: Union[PIL.Image.Image, np.ndarray, torch.Tensor]):
+         """
+         Args:
+             pic (PIL Image | numpy.ndarray | torch.Tensor): Image to be converted to tensor.
+         """
+         img = pic if isinstance(pic, torch.Tensor) else F.to_tensor(pic)
+         img = cast(torch.Tensor, img)
+
+         if img.shape[0] == 1:
+             img = img.repeat(3, 1, 1)
+
+         return img
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}()"
+
+
+ class ToLowercase(BaseTextTransform):
+     """Convert text to lowercase."""
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to convert to lowercase.
+         """
+         text = text.lower()
+
+         return text
+
+
+ class ToSingular(BaseTextTransform):
+     """Convert plural words to singular form."""
+
+     def __init__(self) -> None:
+         super().__init__()
+         self.transform = inflect.engine().singular_noun
+
+     def __call__(self, text: str):
+         """
+         Args:
+             text (str): Text to convert to singular form.
+         """
+         words = text.split()
+         for i, word in enumerate(words):
+             if not word.endswith("s"):
+                 continue
+
+             if word[-2:] in ["ss", "us", "is"]:
+                 continue
+
+             if word[-3:] in ["ies", "oes"]:
+                 continue
+
+             words[i] = self.transform(word) or word
+
+         text = " ".join(words)
+
+         return text
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}()"
+
+
+ def default_preprocess(size: Optional[int] = None) -> T.Compose:
+     """Preprocess input images with preprocessing transforms.
+
+     Args:
+         size (int): Size to resize image to.
+     """
+     transforms = []
+     if size is not None:
+         transforms.append(DynamicResize(size, interpolation=T.InterpolationMode.BICUBIC))
+     transforms.append(ToRGBTensor())
+     transforms = T.Compose(transforms)
+
+     return transforms
+
+
+ def default_vocabulary_transforms() -> TextCompose:
+     """Preprocess input text with preprocessing transforms."""
+     words_to_drop = [
+         "image",
+         "photo",
+         "picture",
+         "thumbnail",
+         "logo",
+         "symbol",
+         "clipart",
+         "portrait",
+         "painting",
+         "illustration",
+         "icon",
+         "profile",
+     ]
+     pos_tags = ["NN", "NNS", "NNP", "NNPS", "JJ", "JJR", "JJS", "VBG", "VBN"]
+
+     transforms = []
+     transforms.append(DropTokens())
+     transforms.append(DropURLs())
+     transforms.append(DropSpecialCharacters())
+     transforms.append(DropFileExtensions())
+     transforms.append(ReplaceSeparators())
+     transforms.append(DropShortWords(min_length=3))
+     transforms.append(DropNonAlpha())
+     transforms.append(ToLowercase())
+     transforms.append(ToSingular())
+     transforms.append(DropWords(words=words_to_drop))
+     transforms.append(FrequencyMinWordCount(min_count=2))
+     transforms.append(FilterPOS(tags=pos_tags, engine="flair", keep_compound_nouns=False))
+     transforms.append(RemoveDuplicates())
+
+     transforms = TextCompose(transforms)
+
+     return transforms
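
As an illustration of the pipeline returned by `default_vocabulary_transforms()`, a brief sketch that turns two made-up captions into candidate category names (the first call downloads the flair POS tagger, so it needs network access):

from src.transforms import default_vocabulary_transforms

transforms = default_vocabulary_transforms()

# Made-up captions standing in for the ones retrieved from the database.
captions = [
    "A photo of a bowl of ramen noodles on a wooden table <token>",
    "Delicious ramen noodles with egg and pork, https://example.com/ramen.jpg",
]

# Returns a cleaned, singular, de-duplicated list of candidate words.
print(transforms(captions))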