Santiago committed on
Commit
98c1ffc
1 Parent(s): 69cfc30
README.md ADDED
@@ -0,0 +1,43 @@
1
+ ---
2
+ language:
3
+ - en
4
+ tags:
5
+ - vision
6
+ license: apache-2.0
7
+ ---
8
+
9
+ # MedCLIP
10
+
11
+ ## Model description
12
+
13
+
14
+ ## Intended uses & limitations
15
+
16
+ #### How to use
17
+
18
+ ```python
19
+ # You can include sample code which will be formatted
20
+ ```
21
+
22
+ #### Limitations and bias
23
+
24
+ Provide examples of latent issues and potential remediations.
25
+
26
+ ## Training data
27
+
28
+ Describe the data you used to train the model.
29
+ If you initialized it with pre-trained weights, add a link to the pre-trained model card or repository with description of the pre-training data.
30
+
31
+ ## Training procedure
32
+
33
+ Preprocessing, hardware used, hyperparameters...
34
+
35
+ ## Eval results
36
+
37
+ ### BibTeX entry and citation info
38
+
39
+ ```bibtex
40
+ @inproceedings{...,
41
+ year={2020}
42
+ }
43
+ ```
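The `How to use` block in the README above is left as a placeholder in this commit. A minimal, untested sketch of what it could contain, assuming the trained weights are published under this repo id (`flax-community/medclip`, the output dir used in `train_model.sh`) and that `FlaxMedCLIP` exposes the usual CLIP-style call interface (`input_ids`, `attention_mask`, `pixel_values` in, `logits_per_image` out); the image preprocessing here is simplified relative to the `Transform` used during training:

```python
import numpy as np
import jax.numpy as jnp
from PIL import Image
from transformers import AutoTokenizer

from src.modeling_medclip import FlaxMedCLIP

# Assumption: weights are available locally (./model) or on the Hub under this repo id.
model = FlaxMedCLIP.from_pretrained("flax-community/medclip")
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

texts = ["no acute cardiopulmonary process", "left lower lobe pneumonia"]
inputs = tokenizer(texts, max_length=128, padding="max_length", truncation=True, return_tensors="np")

# Images are fed channels-last; 224 matches the CLIP ViT-B/32 vision tower.
# NOTE: training also applies the Normalize step from Transform in run_medclip.py, skipped here.
image = Image.open("chest_xray.jpg").convert("RGB").resize((224, 224))
pixel_values = jnp.asarray(np.array(image), dtype=jnp.float32)[None] / 255.0  # (1, 224, 224, 3)

outputs = model(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    pixel_values=pixel_values,
)
print(outputs.logits_per_image)  # image-text similarity scores for the two candidate captions
```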
prepare_data.py DELETED
@@ -1,141 +0,0 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
-
4
- from typing import Dict, List
5
-
6
- import argparse
7
- import json
8
- from functools import partial
9
- import pathlib
10
- import shutil
11
- import re
12
-
13
- from tqdm import tqdm
14
- from PIL import Image
15
- import pandas as pd
16
-
17
- ImageCaptionMap = Dict[str, Dict[str, str]]
18
-
19
- def _get_image_path(row: pd.Series, root_dir: str = '.') -> str:
20
- path = [
21
- root_dir,
22
- 'files',
23
- f'p{row.subject_id}'[:3],
24
- f'p{row.subject_id}',
25
- f's{row.study_id}',
26
- f'{row.dicom_id}.jpg'
27
- ]
28
-
29
- return '/'.join(path)
30
-
31
- def _prepare_dataframe(
32
- captions: pd.DataFrame,
33
- metadata: pd.DataFrame,
34
- row: pd.Series
35
- ) -> pd.Series:
36
- if f's{row.study_id}' in captions.index:
37
- row[captions.columns] = (
38
- captions
39
- .loc[f's{row.study_id}']
40
- .apply(lambda text: (
41
- re.sub('_+', '_', text)
42
- .replace('\n', ' ')
43
- .lower().rstrip('.')
44
- ))
45
- )
46
-
47
- if row.dicom_id in metadata.index:
48
- row['view_position'] = metadata.loc[row.dicom_id, 'ViewPosition']
49
-
50
- return row
51
-
52
- def copy_image(
53
- row: pd.Series,
54
- target_path: pathlib.Path,
55
- split: str,
56
- size: int = 224
57
- ) -> str:
58
- target_img_path = target_path / split / f'{row.dicom_id}.jpg'
59
- target_img_path = str(target_img_path.resolve())
60
-
61
- img = Image.open(row.path)
62
- img = img.resize((size, size))
63
- img.save(target_img_path)
64
-
65
- return target_img_path
66
-
67
- def generate_dataset(
68
- root_dir: pathlib.Path,
69
- target_dir: pathlib.Path,
70
- split: str = 'validate'
71
- ) -> ImageCaptionMap:
72
- meta_dir = root_dir / 'metadata'
73
-
74
- metadata = pd.read_csv(meta_dir / 'mimic-cxr-2.0.0-metadata.csv')
75
- df_split = pd.read_csv(meta_dir / 'mimic-cxr-2.0.0-split.csv')
76
- captions = pd.read_csv(meta_dir / 'mimic_cxr_sectioned.csv')
77
-
78
- captions = captions.where(~captions.isna(), '').set_index('study')
79
- metadata = metadata.set_index('dicom_id')
80
-
81
- if split in df_split.split.unique():
82
- current_split = df_split[df_split.split == split]
83
- get_abs_path = partial(_get_image_path, root_dir=str(root_dir.resolve()))
84
-
85
- current_split['path'] = current_split.apply(get_abs_path, axis=1)
86
- current_split['view_position'] = ''
87
- for col in captions.columns:
88
- current_split[col] = ''
89
-
90
- preprocess_func = partial(_prepare_dataframe, captions, metadata)
91
-
92
- df = current_split.apply(preprocess_func, axis=1)
93
-
94
- else:
95
- raise ValueError('bad split')
96
-
97
- image_path_to_caption = {}
98
- (target_dir / split).mkdir(exist_ok=True, parents=True)
99
-
100
- for _, element in tqdm(df.iterrows()):
101
- caption = {
102
- 'impression': element['impression'],
103
- 'findings': element['findings'],
104
- 'last_paragraph': element['last_paragraph'],
105
- 'comparison': element['comparison'],
106
- 'view_position': element['view_position'],
107
- }
108
-
109
- image_path = copy_image(element, target_dir, split)
110
-
111
- image_path_to_caption[image_path] = caption
112
-
113
- return image_path_to_caption
114
-
115
- def dump_dataset(image_path_to_caption: ImageCaptionMap) -> List[str]:
116
- lines = []
117
-
118
- for image_path, captions in image_path_to_caption.items():
119
- lines.append(json.dumps({
120
- 'image_path': image_path,
121
- 'caption': captions,
122
- }))
123
-
124
- return lines
125
-
126
- if __name__ == '__main__':
127
- parser = argparse.ArgumentParser(description='Preprocess MIMIC-CXR dataset')
128
- parser.add_argument('--data_dir', help='MIMIC-CXR path')
129
- parser.add_argument('--target_dir', help='output path')
130
-
131
- args = parser.parse_args()
132
-
133
- data_dir = pathlib.Path(args.data_dir)
134
- target_dir = pathlib.Path(args.target_dir)
135
-
136
- for split in ['test', 'validate', 'train']:
137
- image_path_to_caption = generate_dataset(data_dir, target_dir, split)
138
- lines = dump_dataset(image_path_to_caption)
139
-
140
- with open(target_dir / f'{split}_dataset.json', 'w') as f:
141
- f.write('\n'.join(lines))
requirements.txt DELETED
@@ -1,8 +0,0 @@
1
- jax>=0.2.8
2
- jaxlib>=0.1.59
3
- flax>=0.3.4
4
- optax>=0.0.8
5
- -f https://download.pytorch.org/whl/torch_stable.html
6
- torch==1.9.0+cpu
7
- -f https://download.pytorch.org/whl/torch_stable.html
8
- torchvision==0.10.0+cpu
run_medclip.py CHANGED
@@ -1,6 +1,6 @@
1
  #!/usr/bin/env python
2
  # coding=utf-8
3
- # Copyright 2021 The HuggingFace Team All rights reserved.
4
  #
5
  # Licensed under the Apache License, Version 2.0 (the "License");
6
  # you may not use this file except in compliance with the License.
@@ -23,7 +23,6 @@ Vision models: ViT(https://huggingface.co/models?filter=vit), CLIP (https://hugg
23
  Text models: BERT, ROBERTa (https://huggingface.co/models?filter=masked-lm)
24
  """
25
 
26
- import json
27
  import logging
28
  import os
29
  import sys
@@ -34,24 +33,26 @@ from pathlib import Path
34
  from typing import Callable, Optional
35
 
36
  import torch
37
- from torchvision.datasets import VisionDataset
38
- from torchvision.io import ImageReadMode, read_image
39
  from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize
40
  from torchvision.transforms.functional import InterpolationMode
41
  from tqdm import tqdm
42
 
43
  import jax
44
  import jax.numpy as jnp
 
45
  import optax
46
  import transformers
47
  from flax import jax_utils
48
  from flax.jax_utils import unreplicate
49
  from flax.training import train_state
50
  from flax.training.common_utils import get_metrics, shard, shard_prng_key
51
- from src.modeling_medclip import FlaxHybridCLIP
52
  from transformers import AutoTokenizer, HfArgumentParser, TrainingArguments, is_tensorboard_available, set_seed
53
  import wandb
54
 
 
 
 
55
  logger = logging.getLogger(__name__)
56
 
57
  # Cache the result
@@ -69,7 +70,6 @@ else:
69
  "Please run pip install tensorboard to enable."
70
  )
71
 
72
-
73
  @dataclass
74
  class ModelArguments:
75
  """
@@ -119,16 +119,18 @@ class DataTrainingArguments:
119
  Arguments pertaining to what data we are going to input our model for training and eval.
120
  """
121
 
122
- data_dir: Optional[str] = field(default=None, metadata={"help": "The data directory containing input files."})
123
- train_file: Optional[str] = field(
124
  default=None, metadata={"help": "The input training data file (a jsonlines file)."}
125
  )
126
- validation_file: Optional[str] = field(
127
  default=None,
128
  metadata={"help": "An optional input evaluation data file (a jsonlines file)."},
129
  )
 
 
130
  max_seq_length: Optional[int] = field(
131
- default=72,
132
  metadata={
133
  "help": "The maximum total input sequence length after tokenization. Sequences longer "
134
  "than this will be truncated, sequences shorter will be padded."
@@ -155,19 +157,19 @@ class DataTrainingArguments:
155
  default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
156
  )
157
  preprocessing_num_workers: Optional[int] = field(
158
- default=None,
159
  metadata={"help": "The number of processes to use for the preprocessing."},
160
  )
161
 
162
  def __post_init__(self):
163
- if self.train_file is None and self.validation_file is None:
164
  raise ValueError("Need either a dataset name or a training/validation file.")
165
  else:
166
- if self.train_file is not None:
167
- extension = self.train_file.split(".")[-1]
168
  assert extension == "json", "`train_file` should be a json file."
169
- if self.validation_file is not None:
170
- extension = self.validation_file.split(".")[-1]
171
  assert extension == "json", "`validation_file` should be a json file."
172
 
173
 
@@ -188,71 +190,6 @@ class Transform(torch.nn.Module):
188
  x = self.transforms(x)
189
  return x
190
 
191
-
192
- class ImageTextDataset(VisionDataset):
193
- """
194
- Dataset for loading image-text data for tasks like CLIP training, Image Captioning.
195
-
196
- Args:
197
- root: (string): The root path where the dataset is stored
198
- file_path: (string): Path to the file containing the image_paths and associated captions.
199
- The expected format is jsonlines where each line is a json object containing two keys.
200
- `image_path`: The path to the image.
201
- `captions`: An `array` of captions.
202
- transform (callable, optional): A function/transform that takes in an PIL image
203
- and returns a transformed version. E.g, ``transforms.ToTensor``
204
- target_transform (callable, optional): A function/transform that takes in the
205
- target and transforms it.
206
- transforms (callable, optional): A function/transform that takes input sample and its target as entry
207
- and returns a transformed version.
208
- """
209
-
210
- def __init__(
211
- self,
212
- root: str,
213
- file_path: str,
214
- transform: Optional[Callable] = None,
215
- target_transform: Optional[Callable] = None,
216
- transforms: Optional[Callable] = None,
217
- ):
218
- super().__init__(root, transforms, transform, target_transform)
219
-
220
- with open(file_path, "r") as f:
221
- examples = [json.loads(line) for line in f.readlines()]
222
-
223
- self.captions = []
224
- self.image_paths = []
225
-
226
- for example in examples:
227
- self.captions.append(example["caption"])
228
- self.image_paths.append(f'{root}/{example["image_path"]}')
229
-
230
- def _load_image(self, idx: int):
231
- path = self.image_paths[idx]
232
- return read_image(path, mode=ImageReadMode.RGB)
233
-
234
- def _load_target(self, idx):
235
- sections = self.captions[idx]
236
- longest_section = max(
237
- filter(lambda x: isinstance(x, str), sections.values()),
238
- key=len
239
- )
240
-
241
- return longest_section
242
-
243
- def __getitem__(self, index: int):
244
- image = self._load_image(index)
245
- target = self._load_target(index)
246
-
247
- if self.transforms is not None:
248
- image, target = self.transforms(image, target)
249
-
250
- return image, target
251
-
252
- def __len__(self) -> int:
253
- return len(self.captions)
254
-
255
-
256
  class TrainState(train_state.TrainState):
257
  dropout_rng: jnp.ndarray
258
 
@@ -348,7 +285,7 @@ def main():
348
  "You can do it from another script, save it, and load it from here, using --tokenizer_name."
349
  )
350
 
351
- model = FlaxHybridCLIP.from_text_vision_pretrained(
352
  model_args.text_model_name_or_path,
353
  model_args.vision_model_name_or_path,
354
  seed=training_args.seed,
@@ -364,18 +301,51 @@ def main():
364
  preprocess = Transform(config.vision_config.image_size)
365
  preprocess = torch.jit.script(preprocess)
366
 
367
- # Initialize the image-text dataset
368
- train_dataset = ImageTextDataset(
369
- data_args.data_dir,
370
- data_args.train_file,
371
- transform=preprocess,
372
- )
 
374
- eval_dataset = ImageTextDataset(
375
- data_args.data_dir,
376
- data_args.validation_file,
377
- transform=preprocess,
378
- )
 
379
 
380
  # Store some constant
381
  num_epochs = int(training_args.num_train_epochs)
@@ -387,8 +357,15 @@ def main():
387
  # Use collate function to tokenizer the text and convert the processed images to numpy
388
  def collate_fn(examples):
389
  pixel_values = torch.stack([example[0] for example in examples]).permute(0, 2, 3, 1).numpy()
390
- captions = [example[1] for example in examples]
391
- inputs = tokenizer(captions, max_length=data_args.max_seq_length, padding="max_length", return_tensors="np")
392
 
393
  batch = {
394
  "pixel_values": pixel_values,
@@ -406,6 +383,7 @@ def main():
406
  num_workers=data_args.preprocessing_num_workers,
407
  persistent_workers=True,
408
  drop_last=True,
 
409
  collate_fn=collate_fn,
410
  )
411
 
@@ -416,17 +394,20 @@ def main():
416
  num_workers=data_args.preprocessing_num_workers,
417
  persistent_workers=True,
418
  drop_last=True,
 
419
  collate_fn=collate_fn,
420
  )
421
 
422
  # Enable tensorboard only on the master node
423
  if has_tensorboard and jax.process_index() == 0:
424
- summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir).joinpath("logs").as_posix())
 
425
 
426
  # Initialize our training
427
  rng = jax.random.PRNGKey(training_args.seed)
428
  rng, dropout_rng = jax.random.split(rng)
429
 
 
430
  # Create learning rate schedule
431
  linear_decay_lr_schedule_fn = create_learning_rate_fn(
432
  len(train_dataset),
@@ -435,10 +416,17 @@ def main():
435
  training_args.warmup_steps,
436
  training_args.learning_rate,
437
  )
438
 
439
  # create adam optimizer
440
- adamw = optax.adamw(
441
- learning_rate=linear_decay_lr_schedule_fn,
442
  b1=training_args.adam_beta1,
443
  b2=training_args.adam_beta2,
444
  eps=training_args.adam_epsilon,
@@ -473,7 +461,7 @@ def main():
473
 
474
  new_state = state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng)
475
 
476
- metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}
477
  metrics = jax.lax.pmean(metrics, axis_name="batch")
478
 
479
  return new_state, metrics
@@ -506,6 +494,7 @@ def main():
506
  # Create sampling rng
507
  rng, input_rng = jax.random.split(rng)
508
 
 
509
  epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
510
  for epoch in epochs:
511
  # ======================== Training ================================
@@ -572,6 +561,11 @@ def main():
572
  commit_message=f"Saving weights and logs of epoch {epoch+1}",
573
  )
574
 
 
 
 
 
575
 
576
  if __name__ == "__main__":
577
- main()
 
 
1
  #!/usr/bin/env python
2
  # coding=utf-8
3
+ # Copyright 2021 Santiago Hincapie-Potes & The HuggingFace Team All rights reserved.
4
  #
5
  # Licensed under the Apache License, Version 2.0 (the "License");
6
  # you may not use this file except in compliance with the License.
 
23
  Text models: BERT, ROBERTa (https://huggingface.co/models?filter=masked-lm)
24
  """
25
 
 
26
  import logging
27
  import os
28
  import sys
 
33
  from typing import Callable, Optional
34
 
35
  import torch
36
+ from torch.utils.data import ConcatDataset
 
37
  from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize
38
  from torchvision.transforms.functional import InterpolationMode
39
  from tqdm import tqdm
40
 
41
  import jax
42
  import jax.numpy as jnp
43
+ import numpy as onp
44
  import optax
45
  import transformers
46
  from flax import jax_utils
47
  from flax.jax_utils import unreplicate
48
  from flax.training import train_state
49
  from flax.training.common_utils import get_metrics, shard, shard_prng_key
 
50
  from transformers import AutoTokenizer, HfArgumentParser, TrainingArguments, is_tensorboard_available, set_seed
51
  import wandb
52
 
53
+ from src.modeling_medclip import FlaxMedCLIP
54
+ from src.datasets_medclip import MIMICDataset, ROCODataset
55
+
56
  logger = logging.getLogger(__name__)
57
 
58
  # Cache the result
 
70
  "Please run pip install tensorboard to enable."
71
  )
72
 
 
73
  @dataclass
74
  class ModelArguments:
75
  """
 
119
  Arguments pertaining to what data we are going to input our model for training and eval.
120
  """
121
 
122
+ mimic_data_dir: Optional[str] = field(default=None, metadata={"help": "The data directory containing the MIMIC-CXR dataset."})
123
+ mimic_train_file: Optional[str] = field(
124
  default=None, metadata={"help": "The input training data file (a jsonlines file)."}
125
  )
126
+ mimic_validation_file: Optional[str] = field(
127
  default=None,
128
  metadata={"help": "An optional input evaluation data file (a jsonlines file)."},
129
  )
130
+ mimic_mode: Optional[str] = field(default=None, metadata={"help": "longest or docs"})
131
+ roco_data_dir: Optional[str] = field(default=None, metadata={"help": "The data directory containing the ROCO dataset."})
132
  max_seq_length: Optional[int] = field(
133
+ default=128,
134
  metadata={
135
  "help": "The maximum total input sequence length after tokenization. Sequences longer "
136
  "than this will be truncated, sequences shorter will be padded."
 
157
  default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
158
  )
159
  preprocessing_num_workers: Optional[int] = field(
160
+ default=32,
161
  metadata={"help": "The number of processes to use for the preprocessing."},
162
  )
163
 
164
  def __post_init__(self):
165
+ if self.mimic_train_file is None and self.mimic_validation_file is None:
166
  raise ValueError("Need either a dataset name or a training/validation file.")
167
  else:
168
+ if self.mimic_train_file is not None:
169
+ extension = self.mimic_train_file.split(".")[-1]
170
  assert extension == "json", "`train_file` should be a json file."
171
+ if self.mimic_validation_file is not None:
172
+ extension = self.mimic_validation_file.split(".")[-1]
173
  assert extension == "json", "`validation_file` should be a json file."
174
 
175
 
 
190
  x = self.transforms(x)
191
  return x
192
 
193
  class TrainState(train_state.TrainState):
194
  dropout_rng: jnp.ndarray
195
 
 
285
  "You can do it from another script, save it, and load it from here, using --tokenizer_name."
286
  )
287
 
288
+ model = FlaxMedCLIP.from_text_vision_pretrained(
289
  model_args.text_model_name_or_path,
290
  model_args.vision_model_name_or_path,
291
  seed=training_args.seed,
 
301
  preprocess = Transform(config.vision_config.image_size)
302
  preprocess = torch.jit.script(preprocess)
303
 
304
+ _train_datasets = []
305
+ _eval_datasets = []
306
+
307
+ if data_args.mimic_data_dir is not None:
308
+ # Initialize the image-text dataset
309
+ _train_datasets.append(
310
+ MIMICDataset(
311
+ data_args.mimic_data_dir,
312
+ data_args.mimic_train_file,
313
+ transform=preprocess,
314
+ mode=data_args.mimic_mode,
315
+ )
316
+ )
317
 
318
+ _eval_datasets.append(
319
+ MIMICDataset(
320
+ data_args.mimic_data_dir,
321
+ data_args.mimic_validation_file,
322
+ transform=preprocess,
323
+ mode=data_args.mimic_mode,
324
+ )
325
+ )
326
+
327
+ if data_args.roco_data_dir is not None:
328
+ _train_datasets.append(
329
+ ROCODataset(
330
+ data_args.roco_data_dir,
331
+ split="train",
332
+ transform=preprocess,
333
+ )
334
+ )
335
+
336
+ _eval_datasets.append(
337
+ ROCODataset(
338
+ data_args.roco_data_dir,
339
+ split="validate",
340
+ transform=preprocess,
341
+ )
342
+ )
343
+
344
+ if not _train_datasets or not _eval_datasets:
345
+ raise ValueError("At least one of --mimic_data_dir or --roco_data_dir must be provided.")
346
+ else:
347
+ train_dataset = ConcatDataset(_train_datasets)
348
+ eval_dataset = ConcatDataset(_eval_datasets)
349
 
350
  # Store some constant
351
  num_epochs = int(training_args.num_train_epochs)
 
357
  # Use collate function to tokenizer the text and convert the processed images to numpy
358
  def collate_fn(examples):
359
  pixel_values = torch.stack([example[0] for example in examples]).permute(0, 2, 3, 1).numpy()
360
+ texts = [example[1] for example in examples]
361
+
362
+ inputs = tokenizer(
363
+ texts,
364
+ max_length=data_args.max_seq_length,
365
+ padding="max_length",
366
+ return_tensors="np",
367
+ truncation=True,
368
+ )
369
 
370
  batch = {
371
  "pixel_values": pixel_values,
 
383
  num_workers=data_args.preprocessing_num_workers,
384
  persistent_workers=True,
385
  drop_last=True,
386
+ pin_memory=True,
387
  collate_fn=collate_fn,
388
  )
389
 
 
394
  num_workers=data_args.preprocessing_num_workers,
395
  persistent_workers=True,
396
  drop_last=True,
397
+ pin_memory=True,
398
  collate_fn=collate_fn,
399
  )
400
 
401
  # Enable tensorboard only on the master node
402
  if has_tensorboard and jax.process_index() == 0:
403
+ log_dir = Path(training_args.output_dir).joinpath("logs").as_posix()
404
+ summary_writer = SummaryWriter(log_dir=log_dir)
405
 
406
  # Initialize our training
407
  rng = jax.random.PRNGKey(training_args.seed)
408
  rng, dropout_rng = jax.random.split(rng)
409
 
410
+ """
411
  # Create learning rate schedule
412
  linear_decay_lr_schedule_fn = create_learning_rate_fn(
413
  len(train_dataset),
 
416
  training_args.warmup_steps,
417
  training_args.learning_rate,
418
  )
419
+ """
420
+
421
+ cosine_decay_lr_schedule_fn = optax.cosine_decay_schedule(
422
+ training_args.learning_rate,
423
+ training_args.warmup_steps,
424
+ training_args.learning_rate / 1000,  # note: optax treats this `alpha` as a fraction of the initial value, not an absolute final LR
425
+ )
426
 
427
  # create adam optimizer
428
+ adamw = optax.lamb(
429
+ learning_rate=cosine_decay_lr_schedule_fn, #linear_decay_lr_schedule_fn,
430
  b1=training_args.adam_beta1,
431
  b2=training_args.adam_beta2,
432
  eps=training_args.adam_epsilon,
 
461
 
462
  new_state = state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng)
463
 
464
+ metrics = {"loss": loss, "learning_rate": cosine_decay_lr_schedule_fn(state.step)}
465
  metrics = jax.lax.pmean(metrics, axis_name="batch")
466
 
467
  return new_state, metrics
 
494
  # Create sampling rng
495
  rng, input_rng = jax.random.split(rng)
496
 
497
+ #jax.profiler.start_trace(log_dir)
498
  epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
499
  for epoch in epochs:
500
  # ======================== Training ================================
 
561
  commit_message=f"Saving weights and logs of epoch {epoch+1}",
562
  )
563
 
564
+ #jax.profiler.stop_trace()
565
+
566
+ return model, params
567
+
568
 
569
  if __name__ == "__main__":
570
+ model, params = main()
571
+ model.save_pretrained("model", params=params)
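The optimizer hunk above swaps the linear schedule for `optax.cosine_decay_schedule` feeding `optax.lamb` (the variable is still named `adamw`). As a standalone sketch of the schedule semantics, with illustrative values mirroring `train_model.sh`: optax treats the third argument, `alpha`, as a fraction of the initial value, so passing a fraction like `1e-3` gives a floor of `learning_rate / 1000`, whereas passing `learning_rate / 1000` itself (as the hunk does) yields the much smaller floor `learning_rate**2 / 1000`; the decay also runs only for `warmup_steps` and then stays at that floor:

```python
import optax

# Illustrative values taken from train_model.sh.
learning_rate, warmup_steps = 3e-4, 3_000

schedule = optax.cosine_decay_schedule(
    init_value=learning_rate,
    decay_steps=warmup_steps,  # cosine decay runs for this many steps, then the value stays flat
    alpha=1e-3,                # final value = init_value * alpha (a fraction, not an absolute LR)
)

optimizer = optax.lamb(
    learning_rate=schedule,
    b1=0.9,
    b2=0.999,
    eps=1e-8,
    weight_decay=0.1,
)

print(float(schedule(0)), float(schedule(warmup_steps)))  # ~3e-4 at step 0, ~3e-7 after decay_steps
```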
src/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (132 Bytes).
 
src/__pycache__/configuration_medclip.cpython-38.pyc ADDED
Binary file (4.14 kB).
 
src/__pycache__/datasets_medclip.cpython-38.pyc ADDED
Binary file (6.04 kB).
 
src/__pycache__/modeling_medclip.cpython-38.pyc ADDED
Binary file (12.9 kB).
 
src/configuration_medclip.py CHANGED
@@ -7,10 +7,10 @@ from transformers.utils import logging
7
  logger = logging.get_logger(__name__)
8
 
9
 
10
- class HybridCLIPConfig(PretrainedConfig):
11
  r"""
12
- :class:`HybridCLIPConfig` is the configuration class to store the configuration of a
13
- :class:`~HybridCLIPModel`. It is used to instantiate HybridCLIPModel model according to the specified arguments,
14
  defining the text model and vision model configs.
15
 
16
  Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
@@ -28,13 +28,13 @@ class HybridCLIPConfig(PretrainedConfig):
28
 
29
  Examples::
30
 
31
- >>> from transformers import BertConfig, CLIPConfig, HybridCLIPConfig, FlaxHybridCLIP
32
 
33
  >>> # Initializing a BERT and CLIP configuration
34
  >>> config_text = BertConfig()
35
  >>> config_vision = CLIPConfig()
36
 
37
- >>> config = HybridCLIPConfig.from_text_vision_configs(config_text, config_vision, projection_dim=512)
38
 
39
  >>> # Initializing a BERT and CLIPVision model
40
  >>> model = EncoderDecoderModel(config=config)
@@ -47,8 +47,8 @@ class HybridCLIPConfig(PretrainedConfig):
47
  >>> model.save_pretrained('my-model')
48
 
49
  >>> # loading model and config from pretrained folder
50
- >>> encoder_decoder_config = HybridCLIPConfig.from_pretrained('my-model')
51
- >>> model = FlaxHybridCLIP.from_pretrained('my-model', config=encoder_decoder_config)
52
  """
53
 
54
  model_type = "hybrid-clip"
@@ -84,11 +84,11 @@ class HybridCLIPConfig(PretrainedConfig):
84
  @classmethod
85
  def from_text_vision_configs(cls, text_config: PretrainedConfig, vision_config: PretrainedConfig, **kwargs):
86
  r"""
87
- Instantiate a :class:`HybridCLIPConfig` (or a derived class) from text model configuration and
88
  vision model configuration.
89
 
90
  Returns:
91
- :class:`HybridCLIPConfig`: An instance of a configuration object
92
  """
93
 
94
  return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
 
7
  logger = logging.get_logger(__name__)
8
 
9
 
10
+ class MedCLIPConfig(PretrainedConfig):
11
  r"""
12
+ :class:`MedCLIPConfig` is the configuration class to store the configuration of a
13
+ :class:`~FlaxMedCLIP`. It is used to instantiate a MedCLIP model according to the specified arguments,
14
  defining the text model and vision model configs.
15
 
16
  Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
 
28
 
29
  Examples::
30
 
31
+ >>> from transformers import BertConfig, CLIPConfig, MedCLIPConfig, FlaxMedCLIP
32
 
33
  >>> # Initializing a BERT and CLIP configuration
34
  >>> config_text = BertConfig()
35
  >>> config_vision = CLIPConfig()
36
 
37
+ >>> config = MedCLIPConfig.from_text_vision_configs(config_text, config_vision, projection_dim=512)
38
 
39
  >>> # Initializing a BERT and CLIPVision model
40
  >>> model = EncoderDecoderModel(config=config)
 
47
  >>> model.save_pretrained('my-model')
48
 
49
  >>> # loading model and config from pretrained folder
50
+ >>> encoder_decoder_config = MedCLIPConfig.from_pretrained('my-model')
51
+ >>> model = FlaxMedCLIP.from_pretrained('my-model', config=encoder_decoder_config)
52
  """
53
 
54
  model_type = "hybrid-clip"
 
84
  @classmethod
85
  def from_text_vision_configs(cls, text_config: PretrainedConfig, vision_config: PretrainedConfig, **kwargs):
86
  r"""
87
+ Instantiate a :class:`MedCLIPConfig` (or a derived class) from text model configuration and
88
  vision model configuration.
89
 
90
  Returns:
91
+ :class:`MedCLIPConfig`: An instance of a configuration object
92
  """
93
 
94
  return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
src/datasets_medclip.py ADDED
@@ -0,0 +1,182 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 Santiago Hincapie-Potes & The HuggingFace Team All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import csv
17
+ import json
18
+ import random
19
+ from pathlib import Path
20
+ from typing import Callable, Dict, Optional, Union
21
+
22
+ from torchvision.datasets import VisionDataset
23
+ from torchvision.io import ImageReadMode, read_image
24
+
25
+ class MIMICDataset(VisionDataset):
26
+ """
27
+ Dataset for loading image-text data for tasks like CLIP training, Image Captioning.
28
+
29
+ Args:
30
+ root: (string): The root path where the dataset is stored
31
+ file_path: (string): Path to the file containing the image_paths and associated captions.
32
+ The expected format is jsonlines where each line is a json object containing two keys:
33
+ `image_path`: The path to the image.
34
+ `caption`: A dict mapping report section names (e.g. 'impression', 'findings') to their text.
35
+ mode: (string): target format:
36
+ * 'longest': return the longest sections
37
+ * 'docs': return findings and impressions
38
+ transform (callable, optional): A function/transform that takes in an PIL image
39
+ and returns a transformed version. E.g, ``transforms.ToTensor``
40
+ target_transform (callable, optional): A function/transform that takes in the
41
+ target and transforms it.
42
+ transforms (callable, optional): A function/transform that takes input sample and its target as entry
43
+ and returns a transformed version.
44
+ """
45
+
46
+ def __init__(
47
+ self,
48
+ root: str,
49
+ file_path: str,
50
+ mode: str = 'longest',
51
+ transform: Optional[Callable] = None,
52
+ target_transform: Optional[Callable] = None,
53
+ transforms: Optional[Callable] = None,
54
+ ):
55
+ super().__init__(root, transforms, transform, target_transform)
56
+
57
+ root = Path(root)
58
+
59
+ if mode not in {'longest', 'docs'}:
60
+ raise ValueError('Invalid mode')
61
+
62
+ self.mode = mode
63
+
64
+ with open(root / file_path, "r") as f:
65
+ examples = [json.loads(line) for line in f.readlines()]
66
+
67
+ self.captions = []
68
+ self.image_paths = []
69
+
70
+ for example in examples:
71
+ img_path = root / example["image_path"]
72
+ if img_path.exists():
73
+ self.captions.append(example["caption"])
74
+ self.image_paths.append(str(img_path))
75
+
76
+ def _load_image(self, idx: int):
77
+ path = self.image_paths[idx]
78
+ return read_image(path, mode=ImageReadMode.RGB)
79
+
80
+ def _load_target(self, idx) -> str:
81
+ sections = self.captions[idx]
82
+
83
+ if self.mode == 'docs':
84
+ _collection = []
85
+ if 'impression' in sections:
86
+ _collection.append(sections['impression'])
87
+
88
+ if 'findings' in sections:
89
+ _collection.append(sections['findings'])
90
+
91
+ if len(_collection) == 1:
92
+ output = _collection[0]
93
+ if len(_collection) == 2:
94
+ output = random.choice(_collection)
95
+
96
+ if self.mode == 'longest' or len(_collection) == 0:
97
+ longest_section = max(
98
+ filter(lambda x: isinstance(x, str), sections.values()),
99
+ key=len
100
+ )
101
+
102
+ output = longest_section
103
+
104
+ return output
105
+
106
+ def __getitem__(self, index: int):
107
+ image = self._load_image(index)
108
+ target = self._load_target(index)
109
+
110
+ if self.transforms is not None:
111
+ image, target = self.transforms(image, target)
112
+
113
+ return image, target
114
+
115
+ def __len__(self) -> int:
116
+ return len(self.captions)
117
+
118
+
119
+ class ROCODataset(VisionDataset):
120
+ """
121
+ Dataset for loading image-text data for tasks like CLIP training, Image Captioning.
122
+
123
+ Args:
124
+ root: (string): The root path where the dataset is stored
125
+ split: (string): Which split to load ('train', 'validate' or 'test').
126
+ Captions are read from `<root>/<split>/radiology/<split>.csv` (as produced by
127
+ `tasks/prepare_roco.py`) and images from the `images/` directory next to it.
128
+ Each CSV row is expected to be (id, file name, caption).
129
+ transform (callable, optional): A function/transform that takes in an PIL image
130
+ and returns a transformed version. E.g, ``transforms.ToTensor``
131
+ target_transform (callable, optional): A function/transform that takes in the
132
+ target and transforms it.
133
+ transforms (callable, optional): A function/transform that takes input sample and its target as entry
134
+ and returns a transformed version.
135
+ """
136
+
137
+ def __init__(
138
+ self,
139
+ root: str,
140
+ split: str,
141
+ transform: Optional[Callable] = None,
142
+ target_transform: Optional[Callable] = None,
143
+ transforms: Optional[Callable] = None,
144
+ ):
145
+ super().__init__(root, transforms, transform, target_transform)
146
+
147
+ root = Path(root) / f"{split}/radiology/"
148
+ file_path = f"{split}.csv"
149
+
150
+ self.captions = []
151
+ self.image_paths = []
152
+
153
+ with open((root / file_path).resolve(), 'r') as buf:
154
+ csv_reader = csv.reader(buf)
155
+ next(csv_reader) # skip header
156
+
157
+ for row in csv_reader:
158
+ if len(row) == 3:
159
+ _, fname, caption = row
160
+ else:
161
+ print(row)
162
+ self.captions.append(caption.strip())
163
+ self.image_paths.append(str(root / 'images' / fname.strip()))
164
+
165
+ def _load_image(self, idx: int):
166
+ path = self.image_paths[idx]
167
+ return read_image(path, mode=ImageReadMode.RGB)
168
+
169
+ def _load_target(self, idx: int) -> str:
170
+ return self.captions[idx]
171
+
172
+ def __getitem__(self, index: int):
173
+ image = self._load_image(index)
174
+ target = self._load_target(index)
175
+
176
+ if self.transforms is not None:
177
+ image, target = self.transforms(image, target)
178
+
179
+ return image, target
180
+
181
+ def __len__(self) -> int:
182
+ return len(self.captions)
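For reference, a minimal sketch of how these two dataset classes get combined for training, mirroring the `ConcatDataset` wiring in `run_medclip.py`; the paths are placeholders and the collate function is stubbed (the real one tokenizes the captions and stacks pixel values):

```python
from torch.utils.data import ConcatDataset, DataLoader

from src.datasets_medclip import MIMICDataset, ROCODataset

# Placeholder paths; in run_medclip.py these come from --mimic_data_dir / --roco_data_dir.
mimic = MIMICDataset(
    root="/path/to/mimic-cxr",
    file_path="train_dataset.json",  # jsonlines produced by the (now removed) prepare_data.py
    mode="docs",                     # sample from 'impression'/'findings', else fall back to the longest section
)
roco = ROCODataset(root="/path/to/roco-dataset", split="train")

train_dataset = ConcatDataset([mimic, roco])
image, caption = train_dataset[0]    # each item is a (uint8 image tensor, caption string) pair

loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    drop_last=True,
    collate_fn=lambda batch: batch,  # stub; see collate_fn in run_medclip.py for the real batching
)
```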
src/modeling_medclip.py CHANGED
@@ -1,5 +1,5 @@
1
  # coding=utf-8
2
- # Copyright 2021 The HuggingFace Team. All rights reserved.
3
  #
4
  # Licensed under the Apache License, Version 2.0 (the "License");
5
  # you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@ from typing import Optional, Tuple
18
  import flax.linen as nn
19
  import jax
20
  import jax.numpy as jnp
21
- from src.configuration_medclip import HybridCLIPConfig
22
  from flax.core.frozen_dict import FrozenDict
23
  from transformers import FLAX_MODEL_MAPPING, FlaxCLIPVisionModel
24
  from transformers.modeling_flax_utils import FlaxPreTrainedModel
@@ -29,8 +29,8 @@ from transformers.utils import logging
29
  logger = logging.get_logger(__name__)
30
 
31
 
32
- class FlaxHybridCLIPModule(nn.Module):
33
- config: HybridCLIPConfig
34
  dtype: jnp.dtype = jnp.float32
35
 
36
  def setup(self):
@@ -122,13 +122,13 @@ class FlaxHybridCLIPModule(nn.Module):
122
  )
123
 
124
 
125
- class FlaxHybridCLIP(FlaxPreTrainedModel):
126
- config_class = HybridCLIPConfig
127
- module_class = FlaxHybridCLIPModule
128
 
129
  def __init__(
130
  self,
131
- config: HybridCLIPConfig,
132
  input_shape: Optional[Tuple] = None,
133
  seed: int = 0,
134
  dtype: jnp.dtype = jnp.float32,
@@ -347,14 +347,14 @@ class FlaxHybridCLIP(FlaxPreTrainedModel):
347
 
348
  Example::
349
 
350
- >>> from transformers import FlaxHybridCLIP
351
  >>> # initialize a model from pretrained BERT and CLIP models. Note that the projection layers will be randomly initialized.
352
  >>> # If using CLIP's vision model the vision projection layer will be initialized using pre-trained weights
353
- >>> model = FlaxHybridCLIP.from_text_vision_pretrained('bert-base-uncased', 'openai/clip-vit-base-patch32')
354
  >>> # saving model after fine-tuning
355
  >>> model.save_pretrained("./bert-clip")
356
  >>> # load fine-tuned model
357
- >>> model = FlaxHybridCLIP.from_pretrained("./bert-clip")
358
  """
359
 
360
  kwargs_text = {
@@ -404,7 +404,7 @@ class FlaxHybridCLIP(FlaxPreTrainedModel):
404
 
405
  # instantiate config with corresponding kwargs
406
  dtype = kwargs.pop("dtype", jnp.float32)
407
- config = HybridCLIPConfig.from_text_vision_configs(text_model.config, vision_model.config, **kwargs)
408
 
409
  # init model
410
  model = cls(config, *model_args, dtype=dtype, **kwargs)
 
1
  # coding=utf-8
2
+ # Copyright 2021 Santiago Hincapie-Potes & The HuggingFace Team. All rights reserved.
3
  #
4
  # Licensed under the Apache License, Version 2.0 (the "License");
5
  # you may not use this file except in compliance with the License.
 
18
  import flax.linen as nn
19
  import jax
20
  import jax.numpy as jnp
21
+ from src.configuration_medclip import MedCLIPConfig
22
  from flax.core.frozen_dict import FrozenDict
23
  from transformers import FLAX_MODEL_MAPPING, FlaxCLIPVisionModel
24
  from transformers.modeling_flax_utils import FlaxPreTrainedModel
 
29
  logger = logging.get_logger(__name__)
30
 
31
 
32
+ class FlaxMedCLIPModule(nn.Module):
33
+ config: MedCLIPConfig
34
  dtype: jnp.dtype = jnp.float32
35
 
36
  def setup(self):
 
122
  )
123
 
124
 
125
+ class FlaxMedCLIP(FlaxPreTrainedModel):
126
+ config_class = MedCLIPConfig
127
+ module_class = FlaxMedCLIPModule
128
 
129
  def __init__(
130
  self,
131
+ config: MedCLIPConfig,
132
  input_shape: Optional[Tuple] = None,
133
  seed: int = 0,
134
  dtype: jnp.dtype = jnp.float32,
 
347
 
348
  Example::
349
 
350
+ >>> from transformers import FlaxMedCLIP
351
  >>> # initialize a model from pretrained BERT and CLIP models. Note that the projection layers will be randomly initialized.
352
  >>> # If using CLIP's vision model the vision projection layer will be initialized using pre-trained weights
353
+ >>> model = FlaxMedCLIP.from_text_vision_pretrained('bert-base-uncased', 'openai/clip-vit-base-patch32')
354
  >>> # saving model after fine-tuning
355
  >>> model.save_pretrained("./bert-clip")
356
  >>> # load fine-tuned model
357
+ >>> model = FlaxMedCLIP.from_pretrained("./bert-clip")
358
  """
359
 
360
  kwargs_text = {
 
404
 
405
  # instantiate config with corresponding kwargs
406
  dtype = kwargs.pop("dtype", jnp.float32)
407
+ config = MedCLIPConfig.from_text_vision_configs(text_model.config, vision_model.config, **kwargs)
408
 
409
  # init model
410
  model = cls(config, *model_args, dtype=dtype, **kwargs)
tasks/prepare_roco.py ADDED
@@ -0,0 +1,43 @@
1
+ import csv
2
+ from pathlib import Path
3
+ import torchvision
4
+
5
+ def main(roco_root: str):
6
+ root = Path(roco_root)
7
+
8
+ check_images(
9
+ root / 'train/radiology', 'traindata.csv', 'train.csv'
10
+ )
11
+
12
+ check_images(
13
+ root / 'validate/radiology', 'valdata.csv', 'validate.csv'
14
+ )
15
+
16
+ check_images(
17
+ root / 'test/radiology', 'testdata.csv', 'test.csv'
18
+ )
19
+
20
+ def check_images(split_dir: Path, input_csv: str, target_output: str):
21
+ with open(split_dir / input_csv, 'r') as buf:
22
+ csv_reader = csv.reader(buf)
23
+ next(csv_reader, None)
24
+
25
+ filtered_csv = []
26
+
27
+ for row in csv_reader:
28
+ image_path = split_dir / 'images' / row[1]
29
+ try:
30
+ torchvision.io.read_image(str(image_path))
31
+ except Exception:  # skip images that torchvision cannot decode
32
+ continue
33
+ filtered_csv.append(row)
34
+
35
+ with open(split_dir / target_output, 'w') as csvfile:
36
+ spamwriter = csv.writer(csvfile)
37
+ for row in filtered_csv:
38
+ spamwriter.writerow(row)
39
+
40
+
41
+ if __name__ == '__main__':
42
+ main('/home/shpotes/medclip/data/roco-dataset')
train_model.sh CHANGED
@@ -1,15 +1,18 @@
1
  python run_medclip.py \
2
- --output_dir model \
 
3
  --text_model_name_or_path="allenai/scibert_scivocab_uncased" \
4
  --vision_model_name_or_path="openai/clip-vit-base-patch32" \
5
  --tokenizer_name="allenai/scibert_scivocab_uncased" \
6
- --data_dir="/home/shared/data/mimic-cxr" \
7
- --train_file="/home/shared/data/mimic-cxr/train_dataset.json" \
8
- --validation_file="/home/shared/data/mimic-cxr/validate_dataset.json" \
 
 
9
  --do_train --do_eval \
10
- --num_train_epochs="40" --max_seq_length 512 \
11
- --per_device_train_batch_size="64" \
12
- --per_device_eval_batch_size="64" \
13
- --learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 \
14
- --overwrite_output_dir \
15
- --preprocessing_num_workers 32 \
 
1
  python run_medclip.py \
2
+ --output_dir "flax-community/medclip" \
3
+ --overwrite_output_dir \
4
  --text_model_name_or_path="allenai/scibert_scivocab_uncased" \
5
  --vision_model_name_or_path="openai/clip-vit-base-patch32" \
6
  --tokenizer_name="allenai/scibert_scivocab_uncased" \
7
+ --mimic_data_dir="/home/shpotes/medclip/data/mimic-cxr/" \
8
+ --mimic_train_file="train_dataset.json" \
9
+ --mimic_validation_file="validate_dataset.json" \
10
+ --mimic_mode="docs" \
11
+ --roco_data_dir="/home/shpotes/medclip/data/roco-dataset/" \
12
  --do_train --do_eval \
13
+ --num_train_epochs="20" \
14
+ --preprocessing_num_workers=32 \
15
+ --per_device_train_batch_size=64 \
16
+ --per_device_eval_batch_size=64 \
17
+ --warmup_steps=3000 \
18
+ --learning_rate="3e-4"