AleksanderObuchowski committed
Commit • 5ceacbc
Parent(s): 9ee70d2
Add files using upload-large-folder tool
Browse files
- .idea/.gitignore +8 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +7 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- .idea/workspace.xml +204 -0
- 2024.09.27/config.yaml +211 -0
- 2024.09.27/language_model/clip_tokenizer_4.16.2/merges.txt +0 -0
- 2024.09.27/language_model/clip_tokenizer_4.16.2/special_tokens_map.json +27 -0
- 2024.09.27/language_model/clip_tokenizer_4.16.2/tokenizer_config.json +38 -0
- 2024.09.27/language_model/clip_tokenizer_4.16.2/vocab.json +0 -0
- MedImageInsight/Distributed/Utils.py +344 -0
- MedImageInsight/Distributed/__init__.py +6 -0
- MedImageInsight/ImageDataLoader/__init__.py +8 -0
- MedImageInsight/ImageDataLoader/blob_storage.py +244 -0
- MedImageInsight/ImageDataLoader/build.py +260 -0
- MedImageInsight/ImageDataLoader/constants.py +85 -0
- MedImageInsight/ImageDataLoader/languages/__init__.py +0 -0
- MedImageInsight/ImageDataLoader/languages/prompt_engineering.py +101 -0
- MedImageInsight/ImageDataLoader/transforms/__init__.py +1 -0
- MedImageInsight/ImageDataLoader/transforms/autoaugment.py +447 -0
- MedImageInsight/ImageDataLoader/transforms/build.py +261 -0
- MedImageInsight/ImageDataLoader/transforms/threeaugment.py +54 -0
- MedImageInsight/ImageDataLoader/tsv.py +351 -0
- MedImageInsight/ImageDataLoader/tsv_file.py +290 -0
- MedImageInsight/ImageDataLoader/zipdata.py +98 -0
- MedImageInsight/ImageEncoder/__init__.py +8 -0
- MedImageInsight/ImageEncoder/build.py +13 -0
- MedImageInsight/ImageEncoder/coswin.py +779 -0
- MedImageInsight/ImageEncoder/davit_v1.py +727 -0
- MedImageInsight/ImageEncoder/registry.py +18 -0
- MedImageInsight/LangEncoder/__init__.py +13 -0
- MedImageInsight/LangEncoder/build.py +108 -0
- MedImageInsight/LangEncoder/registry.py +18 -0
- MedImageInsight/LangEncoder/transformer.py +210 -0
- MedImageInsight/UniCLModel.py +293 -0
- MedImageInsight/Utils/Arguments.py +134 -0
- MedImageInsight/Utils/GeneraUtils.py +263 -0
- MedImageInsight/Utils/GlobalExceptHook.py +61 -0
- MedImageInsight/Utils/MPIAdapter.py +147 -0
- MedImageInsight/Utils/Utils.py +141 -0
- MedImageInsight/Utils/__init__.py +7 -0
- MedImageInsight/__init__.py +9 -0
.idea/.gitignore
ADDED
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
.idea/misc.xml
ADDED
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.12" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 virtualenv at ~/medatlas/.venv" project-jdk-type="Python SDK" />
+</project>
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/medatlas.iml" filepath="$PROJECT_DIR$/.idea/medatlas.iml" />
+    </modules>
+  </component>
+</project>
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
.idea/workspace.xml
ADDED
@@ -0,0 +1,204 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="AutoImportSettings">
+    <option name="autoReloadType" value="SELECTIVE" />
+  </component>
+  <component name="ChangeListManager">
+    <list default="true" id="9ec92c76-0e74-4c49-9687-c62749296b88" name="Changes" comment="" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="FileTemplateManagerImpl">
+    <option name="RECENT_TEMPLATES">
+      <list>
+        <option value="Python Script" />
+      </list>
+    </option>
+  </component>
+  <component name="FlaskConsoleOptions" custom-start-script="import sys; print('Python %s on %s' % (sys.version, sys.platform)); sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS]) from flask.cli import ScriptInfo, NoAppException for module in ["main.py", "wsgi.py", "app.py"]: try: locals().update(ScriptInfo(app_import_path=module, create_app=None).load_app().make_shell_context()); print("\nFlask App: %s" % app.import_name); break except NoAppException: pass">
+    <envs>
+      <env key="FLASK_APP" value="app" />
+    </envs>
+    <option name="myCustomStartScript" value="import sys; print('Python %s on %s' % (sys.version, sys.platform)); sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS]) from flask.cli import ScriptInfo, NoAppException for module in ["main.py", "wsgi.py", "app.py"]: try: locals().update(ScriptInfo(app_import_path=module, create_app=None).load_app().make_shell_context()); print("\nFlask App: %s" % app.import_name); break except NoAppException: pass" />
+    <option name="myEnvs">
+      <map>
+        <entry key="FLASK_APP" value="app" />
+      </map>
+    </option>
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
+  </component>
+  <component name="HighlightingSettingsPerFile">
+    <setting file="file://$PROJECT_DIR$/.venv/lib/python3.8/site-packages/safetensors/torch.py" root0="SKIP_INSPECTION" />
+  </component>
+  <component name="ProjectColorInfo">{
+  "associatedIndex": 3
+}</component>
+  <component name="ProjectId" id="2nytZGYw1NHCwZYyKjVZHmbmsFp" />
+  <component name="ProjectLevelVcsManager">
+    <ConfirmationsSetting value="1" id="Add" />
+  </component>
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "Python.example.executor": "Debug",
+    "Python.explainability.executor": "Run",
+    "Python.main.executor": "Run",
+    "Python.medimageinsightmodel.executor": "Run",
+    "Python.push_to_hub.executor": "Run",
+    "RunOnceActivity.ShowReadmeOnStart": "true",
+    "RunOnceActivity.git.unshallow": "true",
+    "git-widget-placeholder": "master",
+    "last_opened_file_path": "/home/olek/medatlas/2024.09.27/vision_model",
+    "node.js.detected.package.eslint": "true",
+    "node.js.detected.package.tslint": "true",
+    "node.js.selected.package.eslint": "(autodetect)",
+    "node.js.selected.package.tslint": "(autodetect)",
+    "nodejs_package_manager_path": "npm",
+    "vue.rearranger.settings.migration": "true"
+  }
+}]]></component>
+  <component name="RdControllerToolWindowsLayoutState" isNewUi="true">
+    <layout>
+      <window_info id="Bookmarks" side_tool="true" />
+      <window_info id="Merge Requests" />
+      <window_info id="Commit_Guest" show_stripe_button="false" />
+      <window_info id="Pull Requests" />
+      <window_info id="Learn" />
+      <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.16933593" />
+      <window_info id="Commit" order="1" weight="0.25" />
+      <window_info id="Structure" order="2" side_tool="true" weight="0.25" />
+      <window_info anchor="bottom" id="Database Changes" />
+      <window_info anchor="bottom" id="TypeScript" />
+      <window_info anchor="bottom" id="TODO" />
+      <window_info anchor="bottom" id="File Transfer" />
+      <window_info anchor="bottom" id="Version Control" order="0" />
+      <window_info anchor="bottom" id="Problems" order="1" />
+      <window_info anchor="bottom" id="Problems View" order="2" />
+      <window_info active="true" anchor="bottom" id="Terminal" order="3" visible="true" weight="0.3795139" />
+      <window_info anchor="bottom" id="Services" order="4" />
+      <window_info anchor="bottom" id="Python Packages" order="5" weight="0.1" />
+      <window_info anchor="bottom" id="Debug" order="6" weight="0.29618055" />
+      <window_info anchor="bottom" id="Python Console" order="7" weight="0.1" />
+      <window_info anchor="bottom" id="HfCacheToolWindow" order="8" weight="0.44131944" />
+      <window_info anchor="bottom" id="Run" order="9" weight="0.6490499" />
+      <window_info anchor="bottom" id="Find" order="10" weight="0.33020833" />
+      <window_info anchor="right" id="Endpoints" />
+      <window_info anchor="right" id="Coverage" side_tool="true" />
+      <window_info anchor="right" id="SciView" />
+      <window_info anchor="right" content_ui="combo" id="Notifications" order="0" weight="0.25" />
+      <window_info anchor="right" id="AIAssistant" order="1" weight="0.25" />
+      <window_info anchor="right" id="Database" order="2" weight="0.25" />
+      <window_info anchor="right" id="Gradle" order="3" weight="0.25" />
+      <window_info anchor="right" id="Maven" order="4" weight="0.25" />
+      <window_info anchor="right" id="CodeGPT" order="5" weight="0.30566406" />
+      <window_info anchor="right" id="Plots" order="6" weight="0.1" />
+    </layout>
+  </component>
+  <component name="RecentsManager">
+    <key name="CopyFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$/2024.09.27/vision_model" />
+      <recent name="$PROJECT_DIR$" />
+    </key>
+    <key name="MoveFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$" />
+      <recent name="$PROJECT_DIR$/MedImageInsight/ImageEncoder" />
+      <recent name="$PROJECT_DIR$/MedImageInsights" />
+    </key>
+  </component>
+  <component name="RunManager">
+    <configuration name="main" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
+      <module name="medatlas" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/main.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="medimageinsightmodel" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="medatlas" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/medimageinsightmodel.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <recent_temporary>
+      <list>
+        <item itemvalue="Python.medimageinsightmodel" />
+      </list>
+    </recent_temporary>
+  </component>
+  <component name="SharedIndexes">
+    <attachedChunks>
+      <set>
+        <option value="bundled-js-predefined-d6986cc7102b-bed05e336f61-JavaScript-PY-243.21155.22" />
+        <option value="bundled-python-sdk-5ff8a29a62a8-ca77fbc60dd9-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-243.21155.22" />
+      </set>
+    </attachedChunks>
+  </component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="9ec92c76-0e74-4c49-9687-c62749296b88" name="Changes" comment="" />
+      <created>1729957197525</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1729957197525</updated>
+      <workItem from="1729957199944" duration="8141000" />
+      <workItem from="1729970018757" duration="142000" />
+      <workItem from="1729970174785" duration="25000" />
+      <workItem from="1729970270429" duration="53000" />
+      <workItem from="1729970419018" duration="9867000" />
+      <workItem from="1730030408588" duration="2251000" />
+      <workItem from="1730037237796" duration="27583000" />
+    </task>
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="3" />
+  </component>
+  <component name="com.intellij.coverage.CoverageDataManagerImpl">
+    <SUITE FILE_PATH="coverage/medatlas$explainability.coverage" NAME="explainability Coverage Results" MODIFIED="1730155021389" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/medatlas$push_to_hub.coverage" NAME="push_to_hub Coverage Results" MODIFIED="1730031227719" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/medatlas$example.coverage" NAME="example Coverage Results" MODIFIED="1730041646094" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/medatlas$main.coverage" NAME="main Coverage Results" MODIFIED="1730153590829" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/medatlas$medimageinsightmodel.coverage" NAME="medimageinsightmodel Coverage Results" MODIFIED="1730037368621" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+  </component>
+</project>
2024.09.27/config.yaml
ADDED
@@ -0,0 +1,211 @@
+##################
+# Trainer settings
+##################
+
+
+TASK: UniCLTask
+
+NAME: 'Example Eval Configuration'
+SAVE_TIMER_LOG: true
+
+# TUTORIAL STEP 1: CHOOSE SAVE DIR
+SAVE_DIR: ''
+LOG_EVERY: 10
+LOGLEVEL_OVERRIDE: INFO
+LOG_GPU_MEM: true
+RESUME: False
+RESET_DATA_LOADER: false
+
+FP16: true
+ZERO_STAGE: 0
+DEEPSPEED: false
+# ZERO_STAGE: 1
+AMP: PYTORCH
+# USE_APEX_DDP: false
+# USE_APEX_AMP: false
+# USE_HIT: false
+
+FIND_UNUSED_PARAMETERS: false
+
+SAVE_PER_OPTIM_STEPS: 500
+EVAL_PER_OPTIM_STEPS: 250
+EVAL_AT_START: False
+# SAVE_PER_UPDATE_NUM: -1
+# EVAL_PER_UPDATE_NUM: 0 # 0: do evaluation when saving checkpoint, -1: don't do evaluation
+
+NO_AUTO_LR_SCALING: true
+GRAD_CLIPPING: 1.0 #0.07
+
+SET_SAMPLER_EPOCH: true
+
+DONT_LOAD_MODEL: true
+
+user_dir: "./MainzVision" # lower case due to it is used in mainz as such
+
+##################
+# Task settings
+##################
+
+
+
+VERBOSE: true
+WORKERS: 6
+PIN_MEMORY: true
+IMAGE_ENCODER:
+  NAME: davit_v1
+  NUM_CLASSES: 0
+  #IMAGE_SIZE: [384, 384]
+  IMAGE_SIZE: [480, 480]
+  LOAD_PRETRAINED: true
+  PRETRAINED: ''
+  PRETRAINED_LAYERS: '*'
+  IMAGE_MEAN: [0.485, 0.456, 0.406]
+  IMAGE_STD: [0.229, 0.224, 0.225]
+  SPEC:
+    DROP_RATE: 0.1
+    DROP_PATH_RATE: 0.2
+    PATCH_SIZE: [7, 3, 3, 3]
+    PATCH_STRIDE: [4, 2, 2, 2]
+    PATCH_PADDING: [3, 1, 1, 1]
+    PATCH_PRENORM: [false, true, true, true]
+    DIM_EMBED: [256, 512, 1024, 2048]
+    NUM_HEADS: [8, 16, 32, 64]
+    NUM_GROUPS: [8, 16, 32, 64]
+    DEPTHS: [1, 1, 9, 1]
+    WINDOW_SIZE: 12
+    ENABLE_CHECKPOINT: true
+
+LANG_ENCODER:
+  NAME: transformer
+  LOAD_PRETRAINED: false
+  PRETRAINED: ''
+  PRETRAINED_LAYERS: '*'
+  TOKENIZER: clip
+  CONTEXT_LENGTH: 77
+  WIDTH: 1024
+  HEADS: 16
+  LAYERS: 16
+  AUTOGRESSIVE: false
+
+UNICL_MODEL:
+  DIM_PROJECTION: 1024
+  GATHER_TENSORS: true
+  LOAD_PRETRAINED: true
+
+  # TUTORIAL STEP 2: CHOOSE MODEL PATH
+  PRETRAINED: ''
+
+  PRETRAINED_LAYERS: '*'
+
+AUG:
+  MIXUP_PROB: 0.0
+  MIXUP: 0.8
+  MIXCUT: 1.0
+  MIXCUT_MINMAX: []
+  MIXUP_SWITCH_PROB: 0.5
+  MIXUP_MODE: 'batch'
+  SCALE: [0.8, 1.0]
+  RATIO: [0.75, 1.3333333]
+  INTERPOLATION: 'bicubic'
+  TORCHVISION_AUG:
+    AUTO_AUGMENT: ta_wide
+    RE_PROB: 0.25
+    HFLIP: 0.0
+    VFLIP: 0.0
+
+LOSS:
+  LOSS: UniCL
+DATASET:
+  DATASET: 'image_text_pairs_v2'
+  TEXT_FORMAT: 'json'
+  ROOT: ''
+  TRAIN_SET: 'mimic_cxr_v2-chestxray14-chexpertv4-irma2009_v2-rsnaboneage-mura-bingmedicalfewshot'
+  DATA_FORMAT: 'tsv'
+  SAMPLER: 'default'
+  LOADER: 'default'
+  TOKEN_FILE: ''
+  #PROMPT_ENGINEERING: False
+  #SAMPLER: 'chunk'
+  #LOADER: 'azcopy'
+  #TOKEN_FILE: 'cliptrainingpairs.txt'
+  #TEST_SET: 'MarsAtrain'
+
+
+# TUTORIAL STEP 3: CHOOSE ALL BELOW EVAL PATHS (THESE ARE ALL OPTIONAL EXTRA EVALS)
+# Note how one eval is ZIP format and the other is TSV format.
+
+
+
+
+EVALDATASET_LTCXR_S100_N100_TEXT_CLASSIFIER:
+  TEXT_FORMAT: json
+  FORMAT: 'zip'
+  SPLIT: 'NIH-CXR-LT'
+  ZIP_FILE: ''
+  ZIP_MAP_FILE: ''
+  LABEL_FILE: ''
+  IMAGE_TSV: ''
+  TEXT_TSV: ''
+  CWEIGHT_FILE: ''
+  ZS_MODE: 2
+  ZS_WEIGHT: 1.0
+  KNN: 100
+  # CLASSIFICATION_SETS: ['NIH-CXR-LT']
+  # NUM_CLASSES: [20]
+
+
+
+
+# TUTORIAL STEP 4: SET THE DEFAULT ZEROSHOT EVAL (THIS IS THE MANDATORY EVAL)
+
+ZEROSHOT_EVAL_DATASET:
+  FORMAT: 'zip'
+  SPLIT: 'NIH-CXR-LT'
+  ZIP_FILE: ''
+  ZIP_MAP_FILE: ''
+  LABEL_FILE: ''
+
+
+
+EVALUATION_SPLITS: ['cls-zeroshot-eval']
+TEST:
+  BATCH_SIZE_PER_GPU: 8
+  MODEL_FILE: ''
+  CENTER_CROP: false
+TRAIN:
+  BATCH_SIZE_TOTAL: 1024
+  BATCH_SIZE_PER_GPU: 16
+
+SHUFFLE: true
+
+WEIGHT_SMOOTHING:
+  decay: 0.999
+  use_cpu: False
+  eval_smoothed_weight: True
+
+START_LEARNING_RATE: 0.00001
+# MAX_NUM_EPOCHS: 2
+MAX_NUM_EPOCHS: 100
+OPTIMIZER: AdamW # adam
+OPTIMIZER_PARAMS:
+  weight_decay: 0.2 #0.1
+CUSTOMIZED_PARAMS_CONF:
+  NO_WEIGHT_DECAY_MODULES: ['dw', 'norm']
+  WEIGHT_DECAY_PATTERNS:
+    "\\.bias$": 0.0
+    "logit_scale": 0.0
+    "positional_embedding": 0.0
+    "token_embedding": 0.0
+
+
+
+LR_SCHEDULER: TimmScheduler
+LR_SCHEDULER_PARAMS:
+  sched: cosine
+  warmup_steps: 5
+  warmup_lr: 0.000000001
+  min_lr: 0.000000001
+
+# GRADIENT_ACCUMULATE_STEP will be updated by:
+# BATCH_SIZE_TOTAL // (BATCH_SIZE_PER_GPU * world_size)
+GRADIENT_ACCUMULATE_STEP: -1
2024.09.27/language_model/clip_tokenizer_4.16.2/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
2024.09.27/language_model/clip_tokenizer_4.16.2/special_tokens_map.json
ADDED
@@ -0,0 +1,27 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "single_word": false,
+    "lstrip": false,
+    "rstrip": false,
+    "normalized": true,
+    "special": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "single_word": false,
+    "lstrip": false,
+    "rstrip": false,
+    "normalized": true,
+    "special": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "single_word": false,
+    "lstrip": false,
+    "rstrip": false,
+    "normalized": true,
+    "special": false
+  },
+  "pad_token": "<|endoftext|>"
+}
2024.09.27/language_model/clip_tokenizer_4.16.2/tokenizer_config.json
ADDED
@@ -0,0 +1,38 @@
+{
+  "errors": "replace",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "single_word": false,
+    "lstrip": false,
+    "rstrip": false,
+    "normalized": true,
+    "special": false,
+    "__type": "AddedToken"
+  },
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "single_word": false,
+    "lstrip": false,
+    "rstrip": false,
+    "normalized": true,
+    "special": false,
+    "__type": "AddedToken"
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "single_word": false,
+    "lstrip": false,
+    "rstrip": false,
+    "normalized": true,
+    "special": false,
+    "__type": "AddedToken"
+  },
+  "pad_token": "<|endoftext|>",
+  "add_prefix_space": false,
+  "do_lower_case": true,
+  "name_or_path": "openai/clip-vit-base-patch32",
+  "model_max_length": 77,
+  "special_tokens_map_file": "/home/ncodella/.cache/huggingface/transformers/18a566598f286c9139f88160c99f84eec492a26bd22738fa9cb44d5b7e0a5c76.cce1206abbad28826f000510f22f354e53e66a97f7c23745a7dfe27609cc07f5",
+  "tokenizer_file": "/home/ncodella/.cache/huggingface/transformers/7811def0c53be25ba790cb67ac785669b508a8d1cf8c912b8ac046c5f08aee68.20428ea8b6821af2719b760af844a371643ff49f255c73285f6ea448e15597fe",
+  "tokenizer_class": "CLIPTokenizer"
+}
2024.09.27/language_model/clip_tokenizer_4.16.2/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
MedImageInsight/Distributed/Utils.py
ADDED
@@ -0,0 +1,344 @@
+import logging
+import os
+import pickle
+import requests
+import tenacity
+import time
+import shutil
+
+import torch
+import torch.distributed as dist
+
+from PIL import Image
+from torchvision.utils import make_grid
+
+
+from fvcore.nn import FlopCountAnalysis
+from fvcore.nn import flop_count_table
+from fvcore.nn import flop_count_str
+
+logger = logging.getLogger(__name__)
+
+NORM_MODULES = [
+    torch.nn.BatchNorm1d,
+    torch.nn.BatchNorm2d,
+    torch.nn.BatchNorm3d,
+    torch.nn.SyncBatchNorm,
+    # NaiveSyncBatchNorm inherits from BatchNorm2d
+    torch.nn.GroupNorm,
+    torch.nn.InstanceNorm1d,
+    torch.nn.InstanceNorm2d,
+    torch.nn.InstanceNorm3d,
+    torch.nn.LayerNorm,
+    torch.nn.LocalResponseNorm,
+]
+
+
+def register_norm_module(cls):
+    NORM_MODULES.append(cls)
+
+    return cls
+
+
+def is_main_process():
+    rank = 0
+    if 'OMPI_COMM_WORLD_SIZE' in os.environ:
+        rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
+
+    return rank == 0
+
+
+@torch.no_grad()
+def analysis_model(model, dump_input, verbose=False):
+    model.eval()
+    flops = FlopCountAnalysis(model, dump_input)
+    total = flops.total()
+    model.train()
+    params_total = sum(p.numel() for p in model.parameters())
+    params_learned = sum(
+        p.numel() for p in model.parameters() if p.requires_grad
+    )
+    logger.info(f"flop count table:\n {flop_count_table(flops)}")
+    if verbose:
+        logger.info(f"flop count str:\n {flop_count_str(flops)}")
+    logger.info(f" Total flops: {total / 1000 / 1000:.3f}M,")
+    logger.info(f" Total params: {params_total / 1000 / 1000:.3f}M,")
+    logger.info(f" Learned params: {params_learned / 1000 / 1000:.3f}M")
+
+    return total, flop_count_table(flops), flop_count_str(flops)
+
+
+def gather_tensors(tensor):
+    """
+    Performs all_gather operation on the provided tensors.
+    *** Warning ***: torch.distributed.all_gather has no gradient.
+    """
+    tensors_gather = [
+        torch.ones_like(tensor)
+        for _ in range(int(os.environ['WORLD_SIZE']))
+    ]
+
+    dist.all_gather(tensors_gather, tensor, async_op=False)
+    # need to do this to restore propagation of the gradients
+    tensors_gather[int(os.environ['RANK'])] = tensor
+    output = torch.cat(tensors_gather, dim=0)
+    return output
+
+
+def is_valid_url(url):
+    try:
+        from urllib import parse
+        return parse.urlparse(str(url)).scheme != ''
+    except Exception:
+        return False
+
+
+@tenacity.retry(stop=tenacity.stop_after_attempt(3))
+def download_file(url, filepath):
+    logger.info(f'Downloading from {url} to {filepath.absolute()}.')
+    with requests.get(url, stream=True, allow_redirects=True, timeout=60) as r:
+        if r.status_code > 200:
+            raise RuntimeError(f'Failed in downloading from {url}, status code {r.status_code}.')
+
+        with open(filepath, 'wb') as f:
+            shutil.copyfileobj(r.raw, f, length=4194304)
+
+
+class DistributionGridFactory:
+    """
+    DistributionGrid Factory for helping create, cache and share the DistributionGrid based on the usage.
+    The DistributionGrid con be shared cross modules only the when this 3 conditions:
+    1. expert parallel group size
+    2. expert parallel replica group size,
+    are the same.
+    """
+    distribution_grid_cache = {}
+
+    @classmethod
+    def get_distribution_grid(cls,
+                              expert_parallel_group_size,
+                              expert_parallel_replica_group_size,
+                              ddp_type):
+        """
+        Get the DistributionGrid by the conditions.
+        Args:
+            expert_parallel_group_size: expert parallel group size
+            expert_parallel_replica_group_size: expert parallel replica group size
+            ddp_type: distributed data parallel type. "DDP" of the recipe, only allow ddp_type is "MAINZ", "OSS" or "ShardedDDP".
+
+        Returns: new created DistributionGrid or shared DistributionGrid.
+
+        Notes: Currently get_distribution_grid only support "DDP" is "MAINZ", "OSS" or "ShardedDDP".
+        """
+        # TODO: Support cases that "DDP" is "FSDP".
+        # For "FSDP", we use the DG of self.opt['fsdp_expert_grid'] which is initialize in DistributedTrainer directly.
+        ddp_type = ddp_type.upper()
+        assert ddp_type in ["MAINZ", "OSS", "SHARDEDDDP"], f'DistributionGrid Factory only support "DDP" is "MAINZ",' \
+                                                           f' "OSS" or "ShardedDDP".' \
+                                                           f' But currently "DDP" is {ddp_type}'
+
+        cached_distributed_grid = cls.distribution_grid_cache.get(
+            (expert_parallel_group_size, expert_parallel_replica_group_size), None)
+
+        if cached_distributed_grid is not None:
+            return cached_distributed_grid
+        else:
+            from ort_moe.grids import DistributionGrid
+            distributed_grid = DistributionGrid(expert_parallel_group_size=expert_parallel_group_size,
+                                                expert_parallel_replica_group_size=expert_parallel_replica_group_size)
+
+            cls.distribution_grid_cache[expert_parallel_group_size,
+                                        expert_parallel_replica_group_size] = distributed_grid
+            return distributed_grid
+
+
+def get_world_size():
+    if not dist.is_available():
+        return 1
+    if not dist.is_initialized():
+        return 1
+    return dist.get_world_size()
+
+
+def get_rank():
+    if not dist.is_available():
+        return 0
+    if not dist.is_initialized():
+        return 0
+    return dist.get_rank()
+
+
+def synchronize():
+    """
+    Helper function to synchronize (barrier) among all processes when
+    using distributed training
+    """
+    if not dist.is_available():
+        return
+    if not dist.is_initialized():
+        return
+    world_size = dist.get_world_size()
+    rank = dist.get_rank()
+    if world_size == 1:
+        return
+
+    def _send_and_wait(r):
+        if rank == r:
+            tensor = torch.tensor(0, device="cuda")
+        else:
+            tensor = torch.tensor(1, device="cuda")
+        dist.broadcast(tensor, r)
+        while tensor.item() == 1:
+            time.sleep(1)
+
+    _send_and_wait(0)
+    # now sync on the main process
+    _send_and_wait(1)
+
+
+def all_gather(data):
+    """
+    Run all_gather on arbitrary picklable data (not necessarily tensors)
+    Args:
+        data: any picklable object
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+    world_size = get_world_size()
+    if world_size == 1:
+        return [data]
+
+    # serialized to a Tensor
+    buffer = pickle.dumps(data)
+    storage = torch.ByteStorage.from_buffer(buffer)
+    tensor = torch.ByteTensor(storage).to("cuda")
+
+    # obtain Tensor size of each rank
+    local_size = torch.LongTensor([tensor.numel()]).to("cuda")
+    size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)]
+    dist.all_gather(size_list, local_size)
+    size_list = [int(size.item()) for size in size_list]
+    max_size = max(size_list)
+
+    # receiving Tensor from all ranks
+    # we pad the tensor because torch all_gather does not support
+    # gathering tensors of different shapes
+    tensor_list = []
+    for _ in size_list:
+        tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda"))
+    if local_size != max_size:
+        padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda")
+        tensor = torch.cat((tensor, padding), dim=0)
+    dist.all_gather(tensor_list, tensor)
+
+    data_list = []
+    for size, tensor in zip(size_list, tensor_list):
+        buffer = tensor.cpu().numpy().tobytes()[:size]
+        data_list.append(pickle.loads(buffer))
+
+    return data_list
+
+
+def all_gather_cpu(data):
+    """
+    Run all_gather on arbitrary picklable data (not necessarily tensors).
+    Args:
+        data: any picklable object
+        group: a torch process group. By default, will use a group which
+            contains all ranks on gloo backend.
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+
+    def _get_global_gloo_group():
+        """
+        Return a process group based on gloo backend, containing all the ranks
+        The result is cached.
+        """
+        if dist.get_backend() == "nccl":
+            return dist.new_group(backend="gloo")
+        else:
+            return dist.group.WORLD
+
+    if get_world_size() == 1:
+        return [data]
+    group = _get_global_gloo_group()  # use CPU group by default, to reduce GPU RAM usage.
+    world_size = dist.get_world_size(group)
+    if world_size == 1:
+        return [data]
+
+    output = [None for _ in range(world_size)]
+    dist.all_gather_object(output, data, group=group)
+    return output
+
+
+def reduce_dict(input_dict, average=True):
+    """
+    Args:
+        input_dict (dict): all the values will be reduced
+        average (bool): whether to do average or sum
+    Reduce the values in the dictionary from all processes so that process with rank
+    0 has the averaged results. Returns a dict with the same fields as
+    input_dict, after reduction.
+    """
+    world_size = get_world_size()
+    if world_size < 2:
+        return input_dict
+    with torch.no_grad():
+        names = []
+        values = []
+        # sort the keys so that they are consistent across processes
+        for k in sorted(input_dict.keys()):
+            names.append(k)
+            values.append(input_dict[k])
+        values = torch.stack(values, dim=0)
+        dist.reduce(values, dst=0)
+        if dist.get_rank() == 0 and average:
+            # only main process gets accumulated, so only divide by
+            # world_size in this case
+            values /= world_size
+        reduced_dict = {k: v for k, v in zip(names, values)}
+    return reduced_dict
+
+
+def broadcast_data(data):
+    if not torch.distributed.is_initialized():
+        return data
+    rank = dist.get_rank()
+    if rank == 0:
+        data_tensor = torch.tensor(data + [0], device="cuda")
+    else:
+        data_tensor = torch.tensor(data + [1], device="cuda")
+    torch.distributed.broadcast(data_tensor, 0)
+    while data_tensor.cpu().numpy()[-1] == 1:
+        time.sleep(1)
+
+    return data_tensor.cpu().numpy().tolist()[:-1]
+
+
+def reduce_sum(tensor):
+    if get_world_size() <= 1:
+        return tensor
+
+    tensor = tensor.clone()
+    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
+    return tensor
+
+
+def save_result(result, filename):
+    output_folder = os.path.dirname(filename)
+    basename = os.path.splitext(os.path.basename(filename))[0]
+    os.makedirs(output_folder, exist_ok=True)
+
+    if isinstance(result, torch.Tensor) and result.ndim in [3,4]:
+        if result.ndim==3 and result.size(0) not in [1,3]:
+            result = make_grid(result.unsqueeze(1))
+        elif result.ndim==4:
+            result = make_grid(result)
+        else:
+            result = make_grid([result])
+
+        im = Image.fromarray(result.clamp_(0, 255).permute(1, 2, 0).to(torch.uint8).numpy())
+        im.save(os.path.join(output_folder, '{}.png'.format(basename)))
+    else:
+        torch.save(result, os.path.join(output_folder, '{}.pth'.format(basename)))
MedImageInsight/Distributed/__init__.py
ADDED
@@ -0,0 +1,6 @@
+from .Utils import analysis_model
+from .Utils import is_main_process
+from .Utils import gather_tensors
+from .Utils import register_norm_module
+from .Utils import NORM_MODULES
+from .Utils import DistributionGridFactory
MedImageInsight/ImageDataLoader/__init__.py
ADDED
@@ -0,0 +1,8 @@
+from .build import build_dataloader
+#from .build import build_multitask_dataloader
+from .transforms import build_transforms
+#from .imagenet.real_labels import RealLabelsImagenet
+from .constants import IMAGENET_CLASSES
+from .constants import IMAGENET_DEFAULT_TEMPLATES
+from .zipdata import ZipData
+#from .vision_dataset import VDImageTextDataset, MultiClassTorchDatasetWrapper
MedImageInsight/ImageDataLoader/blob_storage.py
ADDED
@@ -0,0 +1,244 @@
+import os
+import time
+import shutil
+import logging
+import subprocess
+import os.path as op
+from typing import List
+from collections import OrderedDict
+
+import torch.distributed as distributed
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_AZCOPY_PATH = 'azcopy/azcopy'
+
+
+def disk_usage(path: str) -> float:
+    stat = shutil.disk_usage(path)
+    return stat.used / stat.total
+
+
+def is_download_successful(stdout: str) -> bool:
+    for line in stdout.split('\n'):
+        if line == "Number of Transfers Failed: 0":
+            return True
+    logger.info("Azcopy message:\n %s" % stdout)
+    return False
+
+
+def ensure_directory(path):
+    """Check existence of the given directory path. If not, create a new directory.
+
+    Args:
+        path (str): path of a given directory.
+    """
+    if path == '' or path == '.':
+        return
+    if path is not None and len(path) > 0:
+        assert not op.isfile(path), '{} is a file'.format(path)
+        if not op.exists(path) and not op.islink(path):
+            os.makedirs(path, exist_ok=True)
+        # we should always check if it succeeds.
+        assert op.isdir(op.abspath(path)), path
+
+
+class LRU(OrderedDict):
+    def __init__(self, maxsize=3):
+        self.maxsize = maxsize
+
+    def __getitem__(self, key):
+        value = super().__getitem__(key)
+        self.move_to_end(key)
+        return value
+
+    def __setitem__(self, key, value):
+        if key in self:
+            if self[key] is not None:
+                self[key].close()
+            self.move_to_end(key)
+
+        logger.debug('=> Cache {}'.format(key))
+        super().__setitem__(key, value)
+
+        if len(self) > self.maxsize:
+            oldest = next(iter(self))
+            if self[oldest] is not None:
+                self[oldest].close()
+            logger.debug('=> Purged {}'.format(oldest))
+            del self[oldest]
+
+
+class BlobStorage(OrderedDict):
+    """ Pseudo Blob Storage manager
+
+    The registered blobs are maintained in a LRU cache.
+    Limit size, evicting the least recently looked-up key when full.
+    https://docs.python.org/3/library/collections.html#collections.OrderedDict
+
+    Input argument:
+        sas_token (str): path to SAS token.
+    """
+    def __init__(self,
+                 is_train: bool,
+                 sas_token_path: str = None,
+                 azcopy_path: str = None,
+                 *args, **kwds):
+        super().__init__(*args, **kwds)
+        self.maxsize = 2 if is_train else 10 # Set maxsize to large number such val data never get purged.
+        self.is_train = is_train
+
+        if sas_token_path:
+            self.sas_token = BlobStorage.read_sas_token(sas_token_path)
+            self.base_url = self.sas_token[:self.sas_token.index("?")]
+            self.query_string = self.sas_token[self.sas_token.index("?"):]
+            self.container = BlobStorage.extract_container(self.sas_token)
+        else:
+            self.sas_token = None
+            self.base_url = None
+            self.query_string = None
+            self.container = None
+
+        logger.debug(
+            f"=> [BlobStorage] Base url: {self.base_url}"
+            f"=> [BlobStorage] Query string: {self.query_string}"
+            f"=> [BlobStorage] Container name: {self.container}"
+        )
+
+        self.azcopy_path = azcopy_path if azcopy_path else DEFAULT_AZCOPY_PATH
+        self._cached_files = LRU(3)
+
+    def __getitem__(self, key):
+        value = super().__getitem__(key)
+        self.move_to_end(key)
+        return value
+
+    def __setitem__(self, key, value):
+        if key in self:
+            self.move_to_end(key)
+        super().__setitem__(key, value)
+        # NOTE: purge the least recently used data if the disk usage is high.
+        # ITP restarts GPU clusters when disk usage reaches 80%.
+        if len(self) > self.maxsize:
+            oldest = next(iter(self))
+            del self[oldest]
+
+    @staticmethod
+    def read_sas_token(path: str) -> str:
+        with open(path, 'r') as f:
+            token = f.readline().strip()
+        return token
+
+    @staticmethod
+    def extract_container(token: str) -> str:
+        """
+        Input argument:
+            token (str): the full URI of Shared Access Signature (SAS) in the following format.
+                https://[storage_account].blob.core.windows.net/[container_name][SAS_token]
+        """
+        return os.path.basename(token.split('?')[0])
+
+    def _convert_to_blob_url(self, local_path: str):
+        return self.base_url + local_path.split("azcopy")[1] + self.query_string
+
+    def _convert_to_blob_folder_url(self, local_path: str):
+        return self.base_url + local_path.split("azcopy")[1] + "/*" + self.query_string
+
+    def fetch_blob(self, local_path: str) -> None:
+        if op.exists(local_path):
+            logger.info('=> Try to open {}'.format(local_path))
+            fp = open(local_path, 'r')
+            self._cached_files[local_path] = fp
+            logger.debug("=> %s downloaded. Skip." % local_path)
+            return
+        blob_url = self._convert_to_blob_url(local_path)
+        rank = '0' if 'RANK' not in os.environ else os.environ['RANK']
+        cmd = [self.azcopy_path, "copy", blob_url, local_path + rank]
+        curr_usage = disk_usage('/')
+        logger.info(
+            "=> Downloading %s with azcopy ... (disk usage: %.2f%%)"
+            % (local_path, curr_usage * 100)
+        )
+        proc = subprocess.run(cmd, stdout=subprocess.PIPE)
+        while not is_download_successful(proc.stdout.decode()):
+            logger.info("=> Azcopy failed to download {}. Retrying ...".format(blob_url))
+            proc = subprocess.run(cmd, stdout=subprocess.PIPE)
+        if not op.exists(local_path):
+            os.rename(local_path + rank, local_path)
+        else:
+            os.remove(local_path + rank)
+        logger.info(
+            "=> Downloaded %s with azcopy ... (disk usage: %.2f%% => %.2f%%)" %
+            (local_path, curr_usage * 100, disk_usage('/') * 100)
+        )
+
+    def fetch_blob_folder(self, local_path: str, azcopy_args: list=[]) -> None:
+        blob_url = self._convert_to_blob_folder_url(local_path)
+        cmd = [self.azcopy_path, "copy", blob_url, local_path] + azcopy_args
+        curr_usage = disk_usage('/')
+        logger.info(
+            "=> Downloading %s with azcopy args %s ... (disk usage: %.2f%%)"
+            % (local_path, ' '.join(azcopy_args), curr_usage * 100)
+        )
+        proc = subprocess.run(cmd, stdout=subprocess.PIPE)
+        while not is_download_successful(proc.stdout.decode()):
+            logger.info("=> Azcopy failed to download {} with args {}. Retrying ...".format(blob_url, ' '.join(azcopy_args)))
+            proc = subprocess.run(cmd, stdout=subprocess.PIPE)
+        logger.info(
+            "=> Downloaded %s with azcopy args %s ... (disk usage: %.2f%% => %.2f%%)" %
+            (local_path, ' '.join(azcopy_args), curr_usage * 100, disk_usage('/') * 100)
+        )
+
+    def register_local_tsv_paths(self, local_paths: List[str]) -> List[str]:
+        if self.sas_token:
+            tsv_paths_new = []
+            lineidx_paths = set()
+            linelist_paths = set()
+            for path in local_paths:
+                tsv_path_az = path.replace(self.container, 'azcopy')
+                tsv_paths_new.append(tsv_path_az)
+                logger.debug("=> Registering {}".format(tsv_path_az))
+
+                if not self.is_train:
+                    logger.info('=> Downloading {}...'.format(tsv_path_az))
+                    self.fetch_blob(tsv_path_az)
+                    logger.info('=> Downloaded {}'.format(tsv_path_az))
+
+                lineidx = op.splitext(path)[0] + '.lineidx'
+                lineidx_ = lineidx.replace(self.container, 'azcopy')
+                if self.is_train:
+                    if not op.isfile(lineidx_) and op.dirname(lineidx_) not in lineidx_paths:
+                        lineidx_paths.add(op.dirname(lineidx_))
+                else:
+                    if not op.isfile(lineidx_):
+                        ensure_directory(op.dirname(lineidx_))
+                        self.fetch_blob(lineidx_)
+
+                linelist = op.splitext(path)[0] + '.linelist'
+                linelist_ = linelist.replace(self.container, 'azcopy')
+                # .linelist does not always exist. Check existence before fetch
+                if self.is_train:
+                    if op.isfile(linelist) and not op.isfile(linelist_) and op.dirname(linelist_) not in linelist_paths:
+                        linelist_paths.add(op.dirname(linelist_))
+                else:
+                    if op.isfile(linelist) and not op.isfile(linelist_):
+                        ensure_directory(op.dirname(linelist_))
+                        self.fetch_blob(linelist_)
+
+            if self.is_train:
+                for path in lineidx_paths:
+                    self.fetch_blob_folder(path, azcopy_args=['--include-pattern', '*.lineidx'])
+
+                for path in linelist_paths:
+                    self.fetch_blob_folder(path, azcopy_args=['--include-pattern', '*.linelist'])
+
+            return tsv_paths_new
+        else:
+            return local_paths
+
+    def open(self, local_path: str):
+        if self.sas_token and 'azcopy' in local_path:
+            while not op.exists(local_path):
+                time.sleep(1)
+        fid = open(local_path, 'r')
+        return fid
MedImageInsight/ImageDataLoader/build.py
ADDED
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import os
import json
import pathlib
from os.path import basename

from timm.data import create_loader
import torch
import torch.utils.data
import torch.distributed as dist
import torchvision.datasets as datasets
from torchvision.io import read_image
from pathlib import Path
from yacs.config import CfgNode as CN

from ..LangEncoder import build_tokenizer

from .tsv import TSVImageTextDatasetV2
from .tsv import TSVMeta
from .transforms import build_transforms

logger = logging.getLogger(__name__)


def build_dataset(cfg, is_train):
    if cfg['DATASET']['DATASET'] == 'image_text_pairs_v2':
        dataset = _build_pairs_dataset_v2(cfg, is_train)
    else:
        raise ValueError(f'Unknown dataset: {cfg["DATASET"]["DATASET"]}')
    return dataset


def _get_tsv_list(cfg, is_train):
    # Resolve the TSV shards to load; a *.list file expands to the paths it contains.
    tmp_list = []
    if is_train and 'TRAIN_TSV_LIST' in cfg['DATASET']:
        tmp_list = cfg['DATASET']['TRAIN_TSV_LIST']
    elif 'TEST_TSV_LIST' in cfg['DATASET']:
        tmp_list = cfg['DATASET']['TEST_TSV_LIST']

    tsv_list = []
    for l in tmp_list:
        if l.endswith('.list'):
            with open(l, 'r') as f:
                tsv_list.extend([i.strip() for i in f])
        else:
            tsv_list.append(l)

    logger.info(f'tsv list: {tsv_list}')

    return tsv_list


def _get_token_file(cfg):
    # Choose which SAS token file this node should use when a list of token files is configured.
    num_nodes = dist.get_world_size() // torch.cuda.device_count()
    if isinstance(cfg['DATASET']['TOKEN_FILE'], list):
        if num_nodes == 1:
            logger.warning('=> Multi token files are provided, but only one node is used for training')
            sas_token_file = cfg['DATASET']['TOKEN_FILE'][0]
        else:
            rank = dist.get_rank()
            node_idx = rank // torch.cuda.device_count()
            num_token_files = len(cfg['DATASET']['TOKEN_FILE'])
            sas_token_file = cfg['DATASET']['TOKEN_FILE'][node_idx % num_token_files]
    else:
        sas_token_file = cfg['DATASET']['TOKEN_FILE']

    sas_token_file = os.path.join(cfg['DATASET']['ROOT'], sas_token_file)

    if (
        cfg['DATASET']['LOADER'] == 'blobfuse'
        or not os.path.isfile(sas_token_file)
    ):
        sas_token_file = None

    return sas_token_file


def _build_pairs_dataset_v2(cfg, is_train):
    transforms = build_transforms(cfg, is_train)
    logger.info('transforms: {}'.format(transforms))

    dataset_name = cfg['DATASET']['TRAIN_SET'] \
        if is_train else cfg['DATASET']['TEST_SET']

    tokenobj = build_tokenizer(cfg['LANG_ENCODER'])

    if cfg['DATASET']['DATA_FORMAT'] != 'tsv':
        raise ValueError('Only support tsv format for pairs dataset v2')

    tsv_list = _get_tsv_list(cfg, is_train)

    if len(tsv_list) > 0:
        tsv_filenames = sorted(
            [
                os.path.join(cfg['DATASET']['ROOT'], dataset_name, f)
                for f in tsv_list
            ]
        )
    else:
        dataset_path = os.path.join(cfg['DATASET']['ROOT'], dataset_name)
        tsv_files = Path(dataset_path).glob('**/*.tsv')

        tsv_filenames = sorted(
            [
                str(path)
                for path in tsv_files
            ]
        )

    # Image and text shards are paired by filename convention (image-*.tsv / text-*.tsv, etc.).
    image_tsv_files = [
        filename
        for filename in tsv_filenames
        if (
            'image-' in basename(filename)
            or 'image_' in basename(filename)
            or '_image' in basename(filename)
            or '-image' in basename(filename)
            or 'images-' in basename(filename)
        )
    ]
    text_tsv_files = [
        filename
        for filename in tsv_filenames
        if (
            'text-' in basename(filename)
            or 'text_' in basename(filename)
            or '_text' in basename(filename)
            or '-text' in basename(filename)
            or 'texts-' in basename(filename)
        )
    ]

    logger.info(
        "=> found %d/%d tsv file(s) to load.",
        len(image_tsv_files), len(text_tsv_files)
    )

    num_captions = 1 \
        if is_train else cfg['DATASET'].get('NUM_CAPTIONS', 1)
    text_format = cfg['DATASET'].get('TEXT_FORMAT', 'json')

    sas_token_file = _get_token_file(cfg)
    logger.info("=> SAS token path: %s", sas_token_file)

    metas = []
    cfg_data = cfg['DATASET']
    if 'CLASSIFICATION_SETS' in cfg_data and 'NUM_CLASSES' in cfg_data:
        for source, num_classes in zip(cfg_data['CLASSIFICATION_SETS'], cfg_data['NUM_CLASSES']):
            metas.append(
                TSVMeta(
                    source=source,
                    num_classes=num_classes,
                    task='classification'
                )
            )
            logger.info('=> add meta: {}'.format(metas[-1]))

    if 'coco-caption' in dataset_name:
        logger.info('=> coco caption data is used')
        logger.info('=> update num_captions: 5, text_format: json')
        logger.warning('=> set sas token to None for coco evaluation')
        sas_token_file = None
        num_captions = 5
        text_format = 'json'

    dataset = TSVImageTextDatasetV2(
        image_tsv_files, text_tsv_files,
        transform=transforms,
        tokenize=tokenobj,
        context_length=cfg['LANG_ENCODER']['CONTEXT_LENGTH'],
        num_captions=num_captions,
        text_format=text_format,
        is_train=is_train,
        sas_token_path=sas_token_file,
        metas=metas,
        prompt_engineering=cfg['DATASET'].get('PROMPT_ENGINEERING', True),
        concat_queries=cfg['DATASET'].get('CONCAT_QUERIES', False)
    )

    logger.info(
        "=> %s set size: %d",
        'train' if is_train else 'val', len(dataset)
    )

    return dataset


def build_dataloader(cfg, is_train=True, distributed=False):
    dataset = build_dataset(cfg, is_train)

    if (
        is_train
        and 'TIMM_AUG' in cfg['AUG']
        and cfg['AUG']['TIMM_AUG']['USE_LOADER']
    ):
        logger.info('=> use timm loader for training')
        timm_cfg = CN(init_dict=cfg['AUG']['TIMM_AUG'])
        data_loader = create_loader(
            dataset,
            input_size=cfg['IMAGE_ENCODER']['IMAGE_SIZE'][0],
            batch_size=cfg['TRAIN']['BATCH_SIZE_PER_GPU'],
            is_training=True,
            use_prefetcher=True,
            no_aug=False,
            re_prob=timm_cfg.RE_PROB,
            re_mode=timm_cfg.RE_MODE,
            re_count=timm_cfg.RE_COUNT,
            re_split=timm_cfg.RE_SPLIT,
            scale=cfg['AUG']['SCALE'],
            ratio=cfg['AUG']['RATIO'],
            hflip=timm_cfg.HFLIP,
            vflip=timm_cfg.VFLIP,
            color_jitter=timm_cfg.COLOR_JITTER,
            auto_augment=timm_cfg.AUTO_AUGMENT,
            num_aug_splits=0,
            interpolation=cfg['AUG']['INTERPOLATION'],
            mean=cfg['IMAGE_ENCODER']['IMAGE_MEAN'],
            std=cfg['IMAGE_ENCODER']['IMAGE_STD'],
            num_workers=cfg['WORKERS'],
            distributed=distributed,
            collate_fn=None,
            pin_memory=cfg['PIN_MEMORY'],
            use_multi_epochs_loader=True
        )
    else:
        if is_train:
            batch_size_per_gpu = cfg['TRAIN']['BATCH_SIZE_PER_GPU']
            shuffle = cfg['TRAIN'].get('SHUFFLE', True)
        else:
            batch_size_per_gpu = cfg['TEST']['BATCH_SIZE_PER_GPU']
            shuffle = cfg['TEST'].get('SHUFFLE', False)

        if distributed or cfg.get('ALWAYS_ENABLE_SAMPLER', False):
            # sampler = build_sampler(cfg, dataset, is_train, shuffle)
            sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=shuffle)
            shuffle = False
        else:
            sampler = None

        data_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size_per_gpu,
            shuffle=shuffle,
            num_workers=cfg['WORKERS'],
            pin_memory=cfg['PIN_MEMORY'],
            sampler=sampler,
            drop_last=True if is_train else False,
            prefetch_factor=cfg.get('PREFETCH_FACTOR', 2)
        )

    return data_loader
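
For orientation, below is a minimal sketch of the nested configuration dictionary that `build_dataloader` above indexes into. Every key is taken from the lookups in the code, but the values are illustrative assumptions, not the shipped defaults; the real settings come from the experiment YAML, and `_get_token_file` additionally requires `torch.distributed` to be initialized before the loader is built.

# Illustrative config sketch for build_dataloader (values are assumptions, not the shipped defaults).
example_cfg = {
    'DATASET': {
        'DATASET': 'image_text_pairs_v2',     # the only dataset type build_dataset accepts
        'DATA_FORMAT': 'tsv',
        'ROOT': '/data/medimageinsight',      # hypothetical path
        'TRAIN_SET': 'train',
        'TEST_SET': 'val',
        'TOKEN_FILE': 'sas_token.txt',        # hypothetical file name
        'LOADER': 'default',                  # anything other than 'blobfuse' keeps the SAS token path
    },
    'LANG_ENCODER': {'CONTEXT_LENGTH': 77},   # plus whatever build_tokenizer expects
    'IMAGE_ENCODER': {
        'IMAGE_SIZE': [480, 480],
        'IMAGE_MEAN': [0.485, 0.456, 0.406],
        'IMAGE_STD': [0.229, 0.224, 0.225],
    },
    'AUG': {'INTERPOLATION': 'bicubic', 'SCALE': [0.08, 1.0], 'RATIO': [0.75, 1.333]},
    'TRAIN': {'BATCH_SIZE_PER_GPU': 64, 'SHUFFLE': True},
    'TEST': {'BATCH_SIZE_PER_GPU': 64, 'SHUFFLE': False},
    'WORKERS': 6,
    'PIN_MEMORY': True,
}

# loader = build_dataloader(example_cfg, is_train=True, distributed=True)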
MedImageInsight/ImageDataLoader/constants.py
ADDED
@@ -0,0 +1,85 @@
IMAGENET_CLASSES = ["tench", "goldfish", "great white shark", "tiger shark", "hammerhead shark", "electric ray", "stingray", "rooster", "hen", "ostrich", "brambling", "goldfinch", "house finch", "junco", "indigo bunting", "American robin", "bulbul", "jay", "magpie", "chickadee", "American dipper", "kite (bird of prey)", "bald eagle", "vulture", "great grey owl", "fire salamander", "smooth newt", "newt", "spotted salamander", "axolotl", "American bullfrog", "tree frog", "tailed frog", "loggerhead sea turtle", "leatherback sea turtle", "mud turtle", "terrapin", "box turtle", "banded gecko", "green iguana", "Carolina anole", "desert grassland whiptail lizard", "agama", "frilled-necked lizard", "alligator lizard", "Gila monster", "European green lizard", "chameleon", "Komodo dragon", "Nile crocodile", "American alligator", "triceratops", "worm snake", "ring-necked snake", "eastern hog-nosed snake", "smooth green snake", "kingsnake", "garter snake", "water snake", "vine snake", "night snake", "boa constrictor", "African rock python", "Indian cobra", "green mamba", "sea snake", "Saharan horned viper", "eastern diamondback rattlesnake", "sidewinder rattlesnake", "trilobite", "harvestman", "scorpion", "yellow garden spider", "barn spider", "European garden spider", "southern black widow", "tarantula", "wolf spider", "tick", "centipede", "black grouse", "ptarmigan", "ruffed grouse", "prairie grouse", "peafowl", "quail", "partridge", "african grey parrot", "macaw", "sulphur-crested cockatoo", "lorikeet", "coucal", "bee eater", "hornbill", "hummingbird", "jacamar", "toucan", "duck", "red-breasted merganser", "goose", "black swan", "tusker", "echidna", "platypus", "wallaby", "koala", "wombat", "jellyfish", "sea anemone", "brain coral", "flatworm", "nematode", "conch", "snail", "slug", "sea slug", "chiton", "chambered nautilus", "Dungeness crab", "rock crab", "fiddler crab", "red king crab", "American lobster", "spiny lobster", "crayfish", "hermit crab", "isopod", "white stork", "black stork", "spoonbill", "flamingo", "little blue heron", "great egret", "bittern bird", "crane bird", "limpkin", "common gallinule", "American coot", "bustard", "ruddy turnstone", "dunlin", "common redshank", "dowitcher", "oystercatcher", "pelican", "king penguin", "albatross", "grey whale", "killer whale", "dugong", "sea lion", "Chihuahua", "Japanese Chin", "Maltese", "Pekingese", "Shih Tzu", "King Charles Spaniel", "Papillon", "toy terrier", "Rhodesian Ridgeback", "Afghan Hound", "Basset Hound", "Beagle", "Bloodhound", "Bluetick Coonhound", "Black and Tan Coonhound", "Treeing Walker Coonhound", "English foxhound", "Redbone Coonhound", "borzoi", "Irish Wolfhound", "Italian Greyhound", "Whippet", "Ibizan Hound", "Norwegian Elkhound", "Otterhound", "Saluki", "Scottish Deerhound", "Weimaraner", "Staffordshire Bull Terrier", "American Staffordshire Terrier", "Bedlington Terrier", "Border Terrier", "Kerry Blue Terrier", "Irish Terrier", "Norfolk Terrier", "Norwich Terrier", "Yorkshire Terrier", "Wire Fox Terrier", "Lakeland Terrier", "Sealyham Terrier", "Airedale Terrier", "Cairn Terrier", "Australian Terrier", "Dandie Dinmont Terrier", "Boston Terrier", "Miniature Schnauzer", "Giant Schnauzer", "Standard Schnauzer", "Scottish Terrier", "Tibetan Terrier", "Australian Silky Terrier", "Soft-coated Wheaten Terrier", "West Highland White Terrier", "Lhasa Apso", "Flat-Coated Retriever", "Curly-coated Retriever", "Golden Retriever", "Labrador Retriever", "Chesapeake Bay Retriever", "German Shorthaired Pointer", "Vizsla", "English 
Setter", "Irish Setter", "Gordon Setter", "Brittany dog", "Clumber Spaniel", "English Springer Spaniel", "Welsh Springer Spaniel", "Cocker Spaniel", "Sussex Spaniel", "Irish Water Spaniel", "Kuvasz", "Schipperke", "Groenendael dog", "Malinois", "Briard", "Australian Kelpie", "Komondor", "Old English Sheepdog", "Shetland Sheepdog", "collie", "Border Collie", "Bouvier des Flandres dog", "Rottweiler", "German Shepherd Dog", "Dobermann", "Miniature Pinscher", "Greater Swiss Mountain Dog", "Bernese Mountain Dog", "Appenzeller Sennenhund", "Entlebucher Sennenhund", "Boxer", "Bullmastiff", "Tibetan Mastiff", "French Bulldog", "Great Dane", "St. Bernard", "husky", "Alaskan Malamute", "Siberian Husky", "Dalmatian", "Affenpinscher", "Basenji", "pug", "Leonberger", "Newfoundland dog", "Great Pyrenees dog", "Samoyed", "Pomeranian", "Chow Chow", "Keeshond", "brussels griffon", "Pembroke Welsh Corgi", "Cardigan Welsh Corgi", "Toy Poodle", "Miniature Poodle", "Standard Poodle", "Mexican hairless dog (xoloitzcuintli)", "grey wolf", "Alaskan tundra wolf", "red wolf or maned wolf", "coyote", "dingo", "dhole", "African wild dog", "hyena", "red fox", "kit fox", "Arctic fox", "grey fox", "tabby cat", "tiger cat", "Persian cat", "Siamese cat", "Egyptian Mau", "cougar", "lynx", "leopard", "snow leopard", "jaguar", "lion", "tiger", "cheetah", "brown bear", "American black bear", "polar bear", "sloth bear", "mongoose", "meerkat", "tiger beetle", "ladybug", "ground beetle", "longhorn beetle", "leaf beetle", "dung beetle", "rhinoceros beetle", "weevil", "fly", "bee", "ant", "grasshopper", "cricket insect", "stick insect", "cockroach", "praying mantis", "cicada", "leafhopper", "lacewing", "dragonfly", "damselfly", "red admiral butterfly", "ringlet butterfly", "monarch butterfly", "small white butterfly", "sulphur butterfly", "gossamer-winged butterfly", "starfish", "sea urchin", "sea cucumber", "cottontail rabbit", "hare", "Angora rabbit", "hamster", "porcupine", "fox squirrel", "marmot", "beaver", "guinea pig", "common sorrel horse", "zebra", "pig", "wild boar", "warthog", "hippopotamus", "ox", "water buffalo", "bison", "ram (adult male sheep)", "bighorn sheep", "Alpine ibex", "hartebeest", "impala (antelope)", "gazelle", "arabian camel", "llama", "weasel", "mink", "European polecat", "black-footed ferret", "otter", "skunk", "badger", "armadillo", "three-toed sloth", "orangutan", "gorilla", "chimpanzee", "gibbon", "siamang", "guenon", "patas monkey", "baboon", "macaque", "langur", "black-and-white colobus", "proboscis monkey", "marmoset", "white-headed capuchin", "howler monkey", "titi monkey", "Geoffroy's spider monkey", "common squirrel monkey", "ring-tailed lemur", "indri", "Asian elephant", "African bush elephant", "red panda", "giant panda", "snoek fish", "eel", "silver salmon", "rock beauty fish", "clownfish", "sturgeon", "gar fish", "lionfish", "pufferfish", "abacus", "abaya", "academic gown", "accordion", "acoustic guitar", "aircraft carrier", "airliner", "airship", "altar", "ambulance", "amphibious vehicle", "analog clock", "apiary", "apron", "trash can", "assault rifle", "backpack", "bakery", "balance beam", "balloon", "ballpoint pen", "Band-Aid", "banjo", "baluster / handrail", "barbell", "barber chair", "barbershop", "barn", "barometer", "barrel", "wheelbarrow", "baseball", "basketball", "bassinet", "bassoon", "swimming cap", "bath towel", "bathtub", "station wagon", "lighthouse", "beaker", "military hat (bearskin or shako)", "beer bottle", "beer glass", "bell tower", "baby bib", "tandem bicycle", 
"bikini", "ring binder", "binoculars", "birdhouse", "boathouse", "bobsleigh", "bolo tie", "poke bonnet", "bookcase", "bookstore", "bottle cap", "hunting bow", "bow tie", "brass memorial plaque", "bra", "breakwater", "breastplate", "broom", "bucket", "buckle", "bulletproof vest", "high-speed train", "butcher shop", "taxicab", "cauldron", "candle", "cannon", "canoe", "can opener", "cardigan", "car mirror", "carousel", "tool kit", "cardboard box / carton", "car wheel", "automated teller machine", "cassette", "cassette player", "castle", "catamaran", "CD player", "cello", "mobile phone", "chain", "chain-link fence", "chain mail", "chainsaw", "storage chest", "chiffonier", "bell or wind chime", "china cabinet", "Christmas stocking", "church", "movie theater", "cleaver", "cliff dwelling", "cloak", "clogs", "cocktail shaker", "coffee mug", "coffeemaker", "spiral or coil", "combination lock", "computer keyboard", "candy store", "container ship", "convertible", "corkscrew", "cornet", "cowboy boot", "cowboy hat", "cradle", "construction crane", "crash helmet", "crate", "infant bed", "Crock Pot", "croquet ball", "crutch", "cuirass", "dam", "desk", "desktop computer", "rotary dial telephone", "diaper", "digital clock", "digital watch", "dining table", "dishcloth", "dishwasher", "disc brake", "dock", "dog sled", "dome", "doormat", "drilling rig", "drum", "drumstick", "dumbbell", "Dutch oven", "electric fan", "electric guitar", "electric locomotive", "entertainment center", "envelope", "espresso machine", "face powder", "feather boa", "filing cabinet", "fireboat", "fire truck", "fire screen", "flagpole", "flute", "folding chair", "football helmet", "forklift", "fountain", "fountain pen", "four-poster bed", "freight car", "French horn", "frying pan", "fur coat", "garbage truck", "gas mask or respirator", "gas pump", "goblet", "go-kart", "golf ball", "golf cart", "gondola", "gong", "gown", "grand piano", "greenhouse", "radiator grille", "grocery store", "guillotine", "hair clip", "hair spray", "half-track", "hammer", "hamper", "hair dryer", "hand-held computer", "handkerchief", "hard disk drive", "harmonica", "harp", "combine harvester", "hatchet", "holster", "home theater", "honeycomb", "hook", "hoop skirt", "gymnastic horizontal bar", "horse-drawn vehicle", "hourglass", "iPod", "clothes iron", "carved pumpkin", "jeans", "jeep", "T-shirt", "jigsaw puzzle", "rickshaw", "joystick", "kimono", "knee pad", "knot", "lab coat", "ladle", "lampshade", "laptop computer", "lawn mower", "lens cap", "letter opener", "library", "lifeboat", "lighter", "limousine", "ocean liner", "lipstick", "slip-on shoe", "lotion", "music speaker", "loupe magnifying glass", "sawmill", "magnetic compass", "messenger bag", "mailbox", "tights", "one-piece bathing suit", "manhole cover", "maraca", "marimba", "mask", "matchstick", "maypole", "maze", "measuring cup", "medicine cabinet", "megalith", "microphone", "microwave oven", "military uniform", "milk can", "minibus", "miniskirt", "minivan", "missile", "mitten", "mixing bowl", "mobile home", "ford model t", "modem", "monastery", "monitor", "moped", "mortar and pestle", "graduation cap", "mosque", "mosquito net", "vespa", "mountain bike", "tent", "computer mouse", "mousetrap", "moving van", "muzzle", "metal nail", "neck brace", "necklace", "baby pacifier", "notebook computer", "obelisk", "oboe", "ocarina", "odometer", "oil filter", "pipe organ", "oscilloscope", "overskirt", "bullock cart", "oxygen mask", "product packet / packaging", "paddle", "paddle wheel", "padlock", "paintbrush", 
"pajamas", "palace", "pan flute", "paper towel", "parachute", "parallel bars", "park bench", "parking meter", "railroad car", "patio", "payphone", "pedestal", "pencil case", "pencil sharpener", "perfume", "Petri dish", "photocopier", "plectrum", "Pickelhaube", "picket fence", "pickup truck", "pier", "piggy bank", "pill bottle", "pillow", "ping-pong ball", "pinwheel", "pirate ship", "drink pitcher", "block plane", "planetarium", "plastic bag", "plate rack", "farm plow", "plunger", "Polaroid camera", "pole", "police van", "poncho", "pool table", "soda bottle", "plant pot", "potter's wheel", "power drill", "prayer rug", "printer", "prison", "projectile", "projector", "hockey puck", "punching bag", "purse", "quill", "quilt", "race car", "racket", "radiator", "radio", "radio telescope", "rain barrel", "recreational vehicle", "fishing casting reel", "reflex camera", "refrigerator", "remote control", "restaurant", "revolver", "rifle", "rocking chair", "rotisserie", "eraser", "rugby ball", "ruler measuring stick", "sneaker", "safe", "safety pin", "salt shaker", "sandal", "sarong", "saxophone", "scabbard", "weighing scale", "school bus", "schooner", "scoreboard", "CRT monitor", "screw", "screwdriver", "seat belt", "sewing machine", "shield", "shoe store", "shoji screen / room divider", "shopping basket", "shopping cart", "shovel", "shower cap", "shower curtain", "ski", "balaclava ski mask", "sleeping bag", "slide rule", "sliding door", "slot machine", "snorkel", "snowmobile", "snowplow", "soap dispenser", "soccer ball", "sock", "solar thermal collector", "sombrero", "soup bowl", "keyboard space bar", "space heater", "space shuttle", "spatula", "motorboat", "spider web", "spindle", "sports car", "spotlight", "stage", "steam locomotive", "through arch bridge", "steel drum", "stethoscope", "scarf", "stone wall", "stopwatch", "stove", "strainer", "tram", "stretcher", "couch", "stupa", "submarine", "suit", "sundial", "sunglasses", "dark glasses", "sunscreen", "suspension bridge", "mop", "sweatshirt", "swim trunks / shorts", "swing", "electrical switch", "syringe", "table lamp", "tank", "tape player", "teapot", "teddy bear", "television", "tennis ball", "thatched roof", "front curtain", "thimble", "threshing machine", "throne", "tile roof", "toaster", "tobacco shop", "toilet seat", "torch", "totem pole", "tow truck", "toy store", "tractor", "semi-trailer truck", "tray", "trench coat", "tricycle", "trimaran", "tripod", "triumphal arch", "trolleybus", "trombone", "hot tub", "turnstile", "typewriter keyboard", "umbrella", "unicycle", "upright piano", "vacuum cleaner", "vase", "vaulted or arched ceiling", "velvet fabric", "vending machine", "vestment", "viaduct", "violin", "volleyball", "waffle iron", "wall clock", "wallet", "wardrobe", "military aircraft", "sink", "washing machine", "water bottle", "water jug", "water tower", "whiskey jug", "whistle", "hair wig", "window screen", "window shade", "Windsor tie", "wine bottle", "airplane wing", "wok", "wooden spoon", "wool", "split-rail fence", "shipwreck", "sailboat", "yurt", "website", "comic book", "crossword", "traffic or street sign", "traffic light", "dust jacket", "menu", "plate", "guacamole", "consomme", "hot pot", "trifle", "ice cream", "popsicle", "baguette", "bagel", "pretzel", "cheeseburger", "hot dog", "mashed potatoes", "cabbage", "broccoli", "cauliflower", "zucchini", "spaghetti squash", "acorn squash", "butternut squash", "cucumber", "artichoke", "bell pepper", "cardoon", "mushroom", "Granny Smith apple", "strawberry", "orange", "lemon", "fig", 
"pineapple", "banana", "jackfruit", "cherimoya (custard apple)", "pomegranate", "hay", "carbonara", "chocolate syrup", "dough", "meatloaf", "pizza", "pot pie", "burrito", "red wine", "espresso", "tea cup", "eggnog", "mountain", "bubble", "cliff", "coral reef", "geyser", "lakeshore", "promontory", "sandbar", "beach", "valley", "volcano", "baseball player", "bridegroom", "scuba diver", "rapeseed", "daisy", "yellow lady's slipper", "corn", "acorn", "rose hip", "horse chestnut seed", "coral fungus", "agaric", "gyromitra", "stinkhorn mushroom", "earth star fungus", "hen of the woods mushroom", "bolete", "corn cob", "toilet paper"]

IMAGENET_DEFAULT_TEMPLATES = [
    '{}.',
    'a bad photo of a {}.',
    'a photo of many {}.',
    'a sculpture of a {}.',
    'a photo of the hard to see {}.',
    'a low resolution photo of the {}.',
    'a rendering of a {}.',
    'graffiti of a {}.',
    'a bad photo of the {}.',
    'a cropped photo of the {}.',
    'a tattoo of a {}.',
    'the embroidered {}.',
    'a photo of a hard to see {}.',
    'a bright photo of a {}.',
    'a photo of a clean {}.',
    'a photo of a dirty {}.',
    'a dark photo of the {}.',
    'a drawing of a {}.',
    'a photo of my {}.',
    'the plastic {}.',
    'a photo of the cool {}.',
    'a close-up photo of a {}.',
    'a black and white photo of the {}.',
    'a painting of the {}.',
    'a painting of a {}.',
    'a pixelated photo of the {}.',
    'a sculpture of the {}.',
    'a bright photo of the {}.',
    'a cropped photo of a {}.',
    'a plastic {}.',
    'a photo of the dirty {}.',
    'a jpeg corrupted photo of a {}.',
    'a blurry photo of the {}.',
    'a photo of the {}.',
    'a good photo of the {}.',
    'a rendering of the {}.',
    'a {} in a video game.',
    'a photo of one {}.',
    'a doodle of a {}.',
    'a close-up photo of the {}.',
    'a photo of a {}.',
    'the origami {}.',
    'the {} in a video game.',
    'a sketch of a {}.',
    'a doodle of the {}.',
    'a origami {}.',
    'a low resolution photo of a {}.',
    'the toy {}.',
    'a rendition of the {}.',
    'a photo of the clean {}.',
    'a photo of a large {}.',
    'a rendition of a {}.',
    'a photo of a nice {}.',
    'a photo of a weird {}.',
    'a blurry photo of a {}.',
    'a cartoon {}.',
    'art of a {}.',
    'a sketch of the {}.',
    'a embroidered {}.',
    'a pixelated photo of a {}.',
    'itap of the {}.',
    'a jpeg corrupted photo of the {}.',
    'a good photo of a {}.',
    'a plushie {}.',
    'a photo of the nice {}.',
    'a photo of the small {}.',
    'a photo of the weird {}.',
    'the cartoon {}.',
    'art of the {}.',
    'a drawing of the {}.',
    'a photo of the large {}.',
    'a black and white photo of a {}.',
    'the plushie {}.',
    'a dark photo of a {}.',
    'itap of a {}.',
    'graffiti of the {}.',
    'a toy {}.',
    'itap of my {}.',
    'a photo of a cool {}.',
    'a photo of a small {}.',
    'a tattoo of the {}.',
]
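
These constants only define the class names and prompt templates; as a sketch of the usual CLIP-style way such lists are combined for zero-shot classification (an assumption about usage, with `expand_prompts` a hypothetical helper, not part of this file), each class name can be expanded with every template before text encoding:

# Hypothetical helper showing how class names and templates are typically paired for zero-shot prompts.
from MedImageInsight.ImageDataLoader.constants import IMAGENET_CLASSES, IMAGENET_DEFAULT_TEMPLATES

def expand_prompts(classnames, templates):
    """Return {classname: [filled prompt, ...]} for zero-shot classification."""
    return {name: [t.format(name) for t in templates] for name in classnames}

prompts = expand_prompts(IMAGENET_CLASSES[:2], IMAGENET_DEFAULT_TEMPLATES)
print(prompts['tench'][:3])  # ['tench.', 'a bad photo of a tench.', 'a photo of many tench.']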
MedImageInsight/ImageDataLoader/languages/__init__.py
ADDED
File without changes
|
MedImageInsight/ImageDataLoader/languages/prompt_engineering.py
ADDED
@@ -0,0 +1,101 @@
import numpy as np
import random


def get_prompt_templates():
    prompt_templates = [
        '{}.',
        'a photo of a {}.',
        'a bad photo of a {}.',
        'a photo of many {}.',
        'a sculpture of a {}.',
        'a photo of the hard to see {}.',
        'a low resolution photo of the {}.',
        'a rendering of a {}.',
        'graffiti of a {}.',
        'a bad photo of the {}.',
        'a cropped photo of the {}.',
        'a tattoo of a {}.',
        'the embroidered {}.',
        'a photo of a hard to see {}.',
        'a bright photo of a {}.',
        'a photo of a clean {}.',
        'a photo of a dirty {}.',
        'a dark photo of the {}.',
        'a drawing of a {}.',
        'a photo of my {}.',
        'the plastic {}.',
        'a photo of the cool {}.',
        'a close-up photo of a {}.',
        'a black and white photo of the {}.',
        'a painting of the {}.',
        'a painting of a {}.',
        'a pixelated photo of the {}.',
        'a sculpture of the {}.',
        'a bright photo of the {}.',
        'a cropped photo of a {}.',
        'a plastic {}.',
        'a photo of the dirty {}.',
        'a jpeg corrupted photo of a {}.',
        'a blurry photo of the {}.',
        'a photo of the {}.',
        'a good photo of the {}.',
        'a rendering of the {}.',
        'a {} in a video game.',
        'a photo of one {}.',
        'a doodle of a {}.',
        'a close-up photo of the {}.',
        'the origami {}.',
        'the {} in a video game.',
        'a sketch of a {}.',
        'a doodle of the {}.',
        'a origami {}.',
        'a low resolution photo of a {}.',
        'the toy {}.',
        'a rendition of the {}.',
        'a photo of the clean {}.',
        'a photo of a large {}.',
        'a rendition of a {}.',
        'a photo of a nice {}.',
        'a photo of a weird {}.',
        'a blurry photo of a {}.',
        'a cartoon {}.',
        'art of a {}.',
        'a sketch of the {}.',
        'a embroidered {}.',
        'a pixelated photo of a {}.',
        'itap of the {}.',
        'a jpeg corrupted photo of the {}.',
        'a good photo of a {}.',
        'a plushie {}.',
        'a photo of the nice {}.',
        'a photo of the small {}.',
        'a photo of the weird {}.',
        'the cartoon {}.',
        'art of the {}.',
        'a drawing of the {}.',
        'a photo of the large {}.',
        'a black and white photo of a {}.',
        'the plushie {}.',
        'a dark photo of a {}.',
        'itap of a {}.',
        'graffiti of the {}.',
        'a toy {}.',
        'itap of my {}.',
        'a photo of a cool {}.',
        'a photo of a small {}.',
        'a tattoo of the {}.',
    ]
    return prompt_templates


def prompt_engineering(classnames):
    prompt_templates = get_prompt_templates()
    temp_idx = np.random.randint(len(prompt_templates))

    if isinstance(classnames, list):
        classname = random.choice(classnames)
    else:
        classname = classnames

    return prompt_templates[temp_idx].replace('{}', classname.replace(',', '').replace('+', ' '))
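
A minimal usage sketch of `prompt_engineering`, assuming the repository root is importable as a package and using made-up class names for illustration: the function samples one template at random and fills it with one of the given names (so the output varies from call to call).

from MedImageInsight.ImageDataLoader.languages.prompt_engineering import prompt_engineering

# Illustrative class names; the dataset pipeline supplies the real ones.
print(prompt_engineering(['chest x-ray', 'chest radiograph']))
# -> e.g. "a photo of the chest x-ray."  (template and name are sampled at random)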
MedImageInsight/ImageDataLoader/transforms/__init__.py
ADDED
@@ -0,0 +1 @@
from .build import build_transforms
MedImageInsight/ImageDataLoader/transforms/autoaugment.py
ADDED
@@ -0,0 +1,447 @@
import math
from enum import Enum
from typing import List, Tuple, Optional, Dict

import torch
from torch import Tensor

from torchvision.transforms import functional as F
from torchvision.transforms.functional import InterpolationMode

__all__ = ["AutoAugmentPolicy", "AutoAugment", "RandAugment", "TrivialAugmentWide"]


def _apply_op(
    img: Tensor, op_name: str, magnitude: float, interpolation: InterpolationMode, fill: Optional[List[float]]
):
    if op_name == "ShearX":
        img = F.affine(
            img,
            angle=0.0,
            translate=[0, 0],
            scale=1.0,
            shear=[math.degrees(magnitude), 0.0],
            interpolation=interpolation,
            fill=fill,
        )
    elif op_name == "ShearY":
        img = F.affine(
            img,
            angle=0.0,
            translate=[0, 0],
            scale=1.0,
            shear=[0.0, math.degrees(magnitude)],
            interpolation=interpolation,
            fill=fill,
        )
    elif op_name == "TranslateX":
        img = F.affine(
            img,
            angle=0.0,
            translate=[int(magnitude), 0],
            scale=1.0,
            interpolation=interpolation,
            shear=[0.0, 0.0],
            fill=fill,
        )
    elif op_name == "TranslateY":
        img = F.affine(
            img,
            angle=0.0,
            translate=[0, int(magnitude)],
            scale=1.0,
            interpolation=interpolation,
            shear=[0.0, 0.0],
            fill=fill,
        )
    elif op_name == "Rotate":
        img = F.rotate(img, magnitude, interpolation=interpolation, fill=fill)
    elif op_name == "Brightness":
        img = F.adjust_brightness(img, 1.0 + magnitude)
    elif op_name == "Color":
        img = F.adjust_saturation(img, 1.0 + magnitude)
    elif op_name == "Contrast":
        img = F.adjust_contrast(img, 1.0 + magnitude)
    elif op_name == "Sharpness":
        img = F.adjust_sharpness(img, 1.0 + magnitude)
    elif op_name == "Posterize":
        img = F.posterize(img, int(magnitude))
    elif op_name == "Solarize":
        img = F.solarize(img, magnitude)
    elif op_name == "AutoContrast":
        img = F.autocontrast(img)
    elif op_name == "Equalize":
        img = F.equalize(img)
    elif op_name == "Invert":
        img = F.invert(img)
    elif op_name == "Identity":
        pass
    else:
        raise ValueError(f"The provided operator {op_name} is not recognized.")
    return img


class AutoAugmentPolicy(Enum):
    """AutoAugment policies learned on different datasets.
    Available policies are IMAGENET, CIFAR10 and SVHN.
    """

    IMAGENET = "imagenet"
    CIFAR10 = "cifar10"
    SVHN = "svhn"


# FIXME: Eliminate copy-pasted code for fill standardization and _augmentation_space() by moving stuff on a base class
class AutoAugment(torch.nn.Module):
    r"""AutoAugment data augmentation method based on
    `"AutoAugment: Learning Augmentation Strategies from Data" <https://arxiv.org/pdf/1805.09501.pdf>`_.
    If the image is torch Tensor, it should be of type torch.uint8, and it is expected
    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
    If img is PIL Image, it is expected to be in mode "L" or "RGB".

    Args:
        policy (AutoAugmentPolicy): Desired policy enum defined by
            :class:`torchvision.transforms.autoaugment.AutoAugmentPolicy`. Default is ``AutoAugmentPolicy.IMAGENET``.
        interpolation (InterpolationMode): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
            image. If given a number, the value is used for all bands respectively.
    """

    def __init__(
        self,
        policy: AutoAugmentPolicy = AutoAugmentPolicy.IMAGENET,
        interpolation: InterpolationMode = InterpolationMode.NEAREST,
        fill: Optional[List[float]] = None,
    ) -> None:
        super().__init__()
        self.policy = policy
        self.interpolation = interpolation
        self.fill = fill
        self.policies = self._get_policies(policy)

    def _get_policies(
        self, policy: AutoAugmentPolicy
    ) -> List[Tuple[Tuple[str, float, Optional[int]], Tuple[str, float, Optional[int]]]]:
        if policy == AutoAugmentPolicy.IMAGENET:
            return [
                (("Posterize", 0.4, 8), ("Rotate", 0.6, 9)),
                (("Solarize", 0.6, 5), ("AutoContrast", 0.6, None)),
                (("Equalize", 0.8, None), ("Equalize", 0.6, None)),
                (("Posterize", 0.6, 7), ("Posterize", 0.6, 6)),
                (("Equalize", 0.4, None), ("Solarize", 0.2, 4)),
                (("Equalize", 0.4, None), ("Rotate", 0.8, 8)),
                (("Solarize", 0.6, 3), ("Equalize", 0.6, None)),
                (("Posterize", 0.8, 5), ("Equalize", 1.0, None)),
                (("Rotate", 0.2, 3), ("Solarize", 0.6, 8)),
                (("Equalize", 0.6, None), ("Posterize", 0.4, 6)),
                (("Rotate", 0.8, 8), ("Color", 0.4, 0)),
                (("Rotate", 0.4, 9), ("Equalize", 0.6, None)),
                (("Equalize", 0.0, None), ("Equalize", 0.8, None)),
                (("Invert", 0.6, None), ("Equalize", 1.0, None)),
                (("Color", 0.6, 4), ("Contrast", 1.0, 8)),
                (("Rotate", 0.8, 8), ("Color", 1.0, 2)),
                (("Color", 0.8, 8), ("Solarize", 0.8, 7)),
                (("Sharpness", 0.4, 7), ("Invert", 0.6, None)),
                (("ShearX", 0.6, 5), ("Equalize", 1.0, None)),
                (("Color", 0.4, 0), ("Equalize", 0.6, None)),
                (("Equalize", 0.4, None), ("Solarize", 0.2, 4)),
                (("Solarize", 0.6, 5), ("AutoContrast", 0.6, None)),
                (("Invert", 0.6, None), ("Equalize", 1.0, None)),
                (("Color", 0.6, 4), ("Contrast", 1.0, 8)),
                (("Equalize", 0.8, None), ("Equalize", 0.6, None)),
            ]
        elif policy == AutoAugmentPolicy.CIFAR10:
            return [
                (("Invert", 0.1, None), ("Contrast", 0.2, 6)),
                (("Rotate", 0.7, 2), ("TranslateX", 0.3, 9)),
                (("Sharpness", 0.8, 1), ("Sharpness", 0.9, 3)),
                (("ShearY", 0.5, 8), ("TranslateY", 0.7, 9)),
                (("AutoContrast", 0.5, None), ("Equalize", 0.9, None)),
                (("ShearY", 0.2, 7), ("Posterize", 0.3, 7)),
                (("Color", 0.4, 3), ("Brightness", 0.6, 7)),
                (("Sharpness", 0.3, 9), ("Brightness", 0.7, 9)),
                (("Equalize", 0.6, None), ("Equalize", 0.5, None)),
                (("Contrast", 0.6, 7), ("Sharpness", 0.6, 5)),
                (("Color", 0.7, 7), ("TranslateX", 0.5, 8)),
                (("Equalize", 0.3, None), ("AutoContrast", 0.4, None)),
                (("TranslateY", 0.4, 3), ("Sharpness", 0.2, 6)),
                (("Brightness", 0.9, 6), ("Color", 0.2, 8)),
                (("Solarize", 0.5, 2), ("Invert", 0.0, None)),
                (("Equalize", 0.2, None), ("AutoContrast", 0.6, None)),
                (("Equalize", 0.2, None), ("Equalize", 0.6, None)),
                (("Color", 0.9, 9), ("Equalize", 0.6, None)),
                (("AutoContrast", 0.8, None), ("Solarize", 0.2, 8)),
                (("Brightness", 0.1, 3), ("Color", 0.7, 0)),
                (("Solarize", 0.4, 5), ("AutoContrast", 0.9, None)),
                (("TranslateY", 0.9, 9), ("TranslateY", 0.7, 9)),
                (("AutoContrast", 0.9, None), ("Solarize", 0.8, 3)),
                (("Equalize", 0.8, None), ("Invert", 0.1, None)),
                (("TranslateY", 0.7, 9), ("AutoContrast", 0.9, None)),
            ]
        elif policy == AutoAugmentPolicy.SVHN:
            return [
                (("ShearX", 0.9, 4), ("Invert", 0.2, None)),
                (("ShearY", 0.9, 8), ("Invert", 0.7, None)),
                (("Equalize", 0.6, None), ("Solarize", 0.6, 6)),
                (("Invert", 0.9, None), ("Equalize", 0.6, None)),
                (("Equalize", 0.6, None), ("Rotate", 0.9, 3)),
                (("ShearX", 0.9, 4), ("AutoContrast", 0.8, None)),
                (("ShearY", 0.9, 8), ("Invert", 0.4, None)),
                (("ShearY", 0.9, 5), ("Solarize", 0.2, 6)),
                (("Invert", 0.9, None), ("AutoContrast", 0.8, None)),
                (("Equalize", 0.6, None), ("Rotate", 0.9, 3)),
                (("ShearX", 0.9, 4), ("Solarize", 0.3, 3)),
                (("ShearY", 0.8, 8), ("Invert", 0.7, None)),
                (("Equalize", 0.9, None), ("TranslateY", 0.6, 6)),
                (("Invert", 0.9, None), ("Equalize", 0.6, None)),
                (("Contrast", 0.3, 3), ("Rotate", 0.8, 4)),
                (("Invert", 0.8, None), ("TranslateY", 0.0, 2)),
                (("ShearY", 0.7, 6), ("Solarize", 0.4, 8)),
                (("Invert", 0.6, None), ("Rotate", 0.8, 4)),
                (("ShearY", 0.3, 7), ("TranslateX", 0.9, 3)),
                (("ShearX", 0.1, 6), ("Invert", 0.6, None)),
                (("Solarize", 0.7, 2), ("TranslateY", 0.6, 7)),
                (("ShearY", 0.8, 4), ("Invert", 0.8, None)),
                (("ShearX", 0.7, 9), ("TranslateY", 0.8, 3)),
                (("ShearY", 0.8, 5), ("AutoContrast", 0.7, None)),
                (("ShearX", 0.7, 2), ("Invert", 0.1, None)),
            ]
        else:
            raise ValueError(f"The provided policy {policy} is not recognized.")

    def _augmentation_space(self, num_bins: int, image_size: List[int]) -> Dict[str, Tuple[Tensor, bool]]:
        return {
            # op_name: (magnitudes, signed)
            "ShearX": (torch.linspace(0.0, 0.3, num_bins), True),
            "ShearY": (torch.linspace(0.0, 0.3, num_bins), True),
            "TranslateX": (torch.linspace(0.0, 150.0 / 331.0 * image_size[0], num_bins), True),
            "TranslateY": (torch.linspace(0.0, 150.0 / 331.0 * image_size[1], num_bins), True),
            "Rotate": (torch.linspace(0.0, 30.0, num_bins), True),
            "Brightness": (torch.linspace(0.0, 0.9, num_bins), True),
            "Color": (torch.linspace(0.0, 0.9, num_bins), True),
            "Contrast": (torch.linspace(0.0, 0.9, num_bins), True),
            "Sharpness": (torch.linspace(0.0, 0.9, num_bins), True),
            "Posterize": (8 - (torch.arange(num_bins) / ((num_bins - 1) / 4)).round().int(), False),
            "Solarize": (torch.linspace(255.0, 0.0, num_bins), False),
            "AutoContrast": (torch.tensor(0.0), False),
            "Equalize": (torch.tensor(0.0), False),
            "Invert": (torch.tensor(0.0), False),
        }

    @staticmethod
    def get_params(transform_num: int) -> Tuple[int, Tensor, Tensor]:
        """Get parameters for autoaugment transformation

        Returns:
            params required by the autoaugment transformation
        """
        policy_id = int(torch.randint(transform_num, (1,)).item())
        probs = torch.rand((2,))
        signs = torch.randint(2, (2,))

        return policy_id, probs, signs

    def forward(self, img: Tensor) -> Tensor:
        """
            img (PIL Image or Tensor): Image to be transformed.

        Returns:
            PIL Image or Tensor: AutoAugmented image.
        """
        fill = self.fill
        if isinstance(img, Tensor):
            if isinstance(fill, (int, float)):
                fill = [float(fill)] * F.get_image_num_channels(img)
            elif fill is not None:
                fill = [float(f) for f in fill]

        transform_id, probs, signs = self.get_params(len(self.policies))

        for i, (op_name, p, magnitude_id) in enumerate(self.policies[transform_id]):
            if probs[i] <= p:
                op_meta = self._augmentation_space(10, F.get_image_size(img))
                magnitudes, signed = op_meta[op_name]
                magnitude = float(magnitudes[magnitude_id].item()) if magnitude_id is not None else 0.0
                if signed and signs[i] == 0:
                    magnitude *= -1.0
                img = _apply_op(img, op_name, magnitude, interpolation=self.interpolation, fill=fill)

        return img

    def __repr__(self) -> str:
        return self.__class__.__name__ + f"(policy={self.policy}, fill={self.fill})"


class RandAugment(torch.nn.Module):
    r"""RandAugment data augmentation method based on
    `"RandAugment: Practical automated data augmentation with a reduced search space"
    <https://arxiv.org/abs/1909.13719>`_.
    If the image is torch Tensor, it should be of type torch.uint8, and it is expected
    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
    If img is PIL Image, it is expected to be in mode "L" or "RGB".

    Args:
        num_ops (int): Number of augmentation transformations to apply sequentially.
        magnitude (int): Magnitude for all the transformations.
        num_magnitude_bins (int): The number of different magnitude values.
        interpolation (InterpolationMode): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
            image. If given a number, the value is used for all bands respectively.
    """

    def __init__(
        self,
        num_ops: int = 2,
        magnitude: int = 9,
        num_magnitude_bins: int = 31,
        interpolation: InterpolationMode = InterpolationMode.NEAREST,
        fill: Optional[List[float]] = None,
    ) -> None:
        super().__init__()
        self.num_ops = num_ops
        self.magnitude = magnitude
        self.num_magnitude_bins = num_magnitude_bins
        self.interpolation = interpolation
        self.fill = fill

    def _augmentation_space(self, num_bins: int, image_size: List[int]) -> Dict[str, Tuple[Tensor, bool]]:
        return {
            # op_name: (magnitudes, signed)
            "Identity": (torch.tensor(0.0), False),
            "ShearX": (torch.linspace(0.0, 0.3, num_bins), True),
            "ShearY": (torch.linspace(0.0, 0.3, num_bins), True),
            "TranslateX": (torch.linspace(0.0, 150.0 / 331.0 * image_size[0], num_bins), True),
            "TranslateY": (torch.linspace(0.0, 150.0 / 331.0 * image_size[1], num_bins), True),
            "Rotate": (torch.linspace(0.0, 30.0, num_bins), True),
            "Brightness": (torch.linspace(0.0, 0.9, num_bins), True),
            "Color": (torch.linspace(0.0, 0.9, num_bins), True),
            "Contrast": (torch.linspace(0.0, 0.9, num_bins), True),
            "Sharpness": (torch.linspace(0.0, 0.9, num_bins), True),
            "Posterize": (8 - (torch.arange(num_bins) / ((num_bins - 1) / 4)).round().int(), False),
            "Solarize": (torch.linspace(255.0, 0.0, num_bins), False),
            "AutoContrast": (torch.tensor(0.0), False),
            "Equalize": (torch.tensor(0.0), False),
        }

    def forward(self, img: Tensor) -> Tensor:
        """
            img (PIL Image or Tensor): Image to be transformed.

        Returns:
            PIL Image or Tensor: Transformed image.
        """
        fill = self.fill
        if isinstance(img, Tensor):
            if isinstance(fill, (int, float)):
                fill = [float(fill)] * F.get_image_num_channels(img)
            elif fill is not None:
                fill = [float(f) for f in fill]

        for _ in range(self.num_ops):
            op_meta = self._augmentation_space(self.num_magnitude_bins, F.get_image_size(img))
            op_index = int(torch.randint(len(op_meta), (1,)).item())
            op_name = list(op_meta.keys())[op_index]
            magnitudes, signed = op_meta[op_name]
            magnitude = float(magnitudes[self.magnitude].item()) if magnitudes.ndim > 0 else 0.0
            if signed and torch.randint(2, (1,)):
                magnitude *= -1.0
            img = _apply_op(img, op_name, magnitude, interpolation=self.interpolation, fill=fill)

        return img

    def __repr__(self) -> str:
        s = self.__class__.__name__ + "("
        s += "num_ops={num_ops}"
        s += ", magnitude={magnitude}"
        s += ", num_magnitude_bins={num_magnitude_bins}"
        s += ", interpolation={interpolation}"
        s += ", fill={fill}"
        s += ")"
        return s.format(**self.__dict__)


class TrivialAugmentWide(torch.nn.Module):
    r"""Dataset-independent data-augmentation with TrivialAugment Wide, as described in
    `"TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation" <https://arxiv.org/abs/2103.10158>`_.
    If the image is torch Tensor, it should be of type torch.uint8, and it is expected
    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
    If img is PIL Image, it is expected to be in mode "L" or "RGB".

    Args:
        num_magnitude_bins (int): The number of different magnitude values.
        interpolation (InterpolationMode): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
            image. If given a number, the value is used for all bands respectively.
    """

    def __init__(
        self,
        num_magnitude_bins: int = 31,
        interpolation: InterpolationMode = InterpolationMode.NEAREST,
        fill: Optional[List[float]] = None,
    ) -> None:
        super().__init__()
        self.num_magnitude_bins = num_magnitude_bins
        self.interpolation = interpolation
        self.fill = fill

    def _augmentation_space(self, num_bins: int) -> Dict[str, Tuple[Tensor, bool]]:
        return {
            # op_name: (magnitudes, signed)
            "Identity": (torch.tensor(0.0), False),
            "ShearX": (torch.linspace(0.0, 0.99, num_bins), True),
            "ShearY": (torch.linspace(0.0, 0.99, num_bins), True),
            "TranslateX": (torch.linspace(0.0, 32.0, num_bins), True),
            "TranslateY": (torch.linspace(0.0, 32.0, num_bins), True),
            "Rotate": (torch.linspace(0.0, 135.0, num_bins), True),
            "Brightness": (torch.linspace(0.0, 0.99, num_bins), True),
            "Color": (torch.linspace(0.0, 0.99, num_bins), True),
            "Contrast": (torch.linspace(0.0, 0.99, num_bins), True),
            "Sharpness": (torch.linspace(0.0, 0.99, num_bins), True),
            "Posterize": (8 - (torch.arange(num_bins) / ((num_bins - 1) / 6)).round().int(), False),
            "Solarize": (torch.linspace(255.0, 0.0, num_bins), False),
            "AutoContrast": (torch.tensor(0.0), False),
            "Equalize": (torch.tensor(0.0), False),
        }

    def forward(self, img: Tensor) -> Tensor:
        """
            img (PIL Image or Tensor): Image to be transformed.

        Returns:
            PIL Image or Tensor: Transformed image.
        """
        fill = self.fill
        if isinstance(img, Tensor):
            if isinstance(fill, (int, float)):
                fill = [float(fill)] * F.get_image_num_channels(img)
            elif fill is not None:
                fill = [float(f) for f in fill]

        op_meta = self._augmentation_space(self.num_magnitude_bins)
        op_index = int(torch.randint(len(op_meta), (1,)).item())
        op_name = list(op_meta.keys())[op_index]
        magnitudes, signed = op_meta[op_name]
        magnitude = (
            float(magnitudes[torch.randint(len(magnitudes), (1,), dtype=torch.long)].item())
            if magnitudes.ndim > 0
            else 0.0
        )
        if signed and torch.randint(2, (1,)):
            magnitude *= -1.0

        return _apply_op(img, op_name, magnitude, interpolation=self.interpolation, fill=fill)

    def __repr__(self) -> str:
        s = self.__class__.__name__ + "("
        s += "num_magnitude_bins={num_magnitude_bins}"
        s += ", interpolation={interpolation}"
        s += ", fill={fill}"
        s += ")"
        return s.format(**self.__dict__)
MedImageInsight/ImageDataLoader/transforms/build.py
ADDED
@@ -0,0 +1,261 @@
1 |
+
from __future__ import absolute_import
|
2 |
+
from __future__ import division
|
3 |
+
from __future__ import print_function
|
4 |
+
|
5 |
+
import timm
|
6 |
+
from timm.data import create_transform
|
7 |
+
|
8 |
+
from yacs.config import CfgNode as CN
|
9 |
+
from PIL import ImageFilter
|
10 |
+
import logging
|
11 |
+
import random
|
12 |
+
|
13 |
+
import torch
|
14 |
+
import torchvision.transforms as T
|
15 |
+
|
16 |
+
|
17 |
+
from .autoaugment import AutoAugmentPolicy
|
18 |
+
from .autoaugment import AutoAugment
|
19 |
+
from .autoaugment import RandAugment
|
20 |
+
from .autoaugment import TrivialAugmentWide
|
21 |
+
from .threeaugment import deitIII_Solarization
|
22 |
+
from .threeaugment import deitIII_gray_scale
|
23 |
+
from .threeaugment import deitIII_GaussianBlur
|
24 |
+
|
25 |
+
from PIL import ImageOps
|
26 |
+
from timm.data.transforms import RandomResizedCropAndInterpolation
|
27 |
+
|
28 |
+
logger = logging.getLogger(__name__)
|
29 |
+
|
30 |
+
|
31 |
+
class GaussianBlur(object):
|
32 |
+
"""Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709"""
|
33 |
+
|
34 |
+
def __init__(self, sigma=[.1, 2.]):
|
35 |
+
self.sigma = sigma
|
36 |
+
|
37 |
+
def __call__(self, x):
|
38 |
+
sigma = random.uniform(self.sigma[0], self.sigma[1])
|
39 |
+
x = x.filter(ImageFilter.GaussianBlur(radius=sigma))
|
40 |
+
return x
|
41 |
+
|
42 |
+
|
43 |
+
def get_resolution(original_resolution):
|
44 |
+
"""Takes (H,W) and returns (precrop, crop)."""
|
45 |
+
area = original_resolution[0] * original_resolution[1]
|
46 |
+
return (160, 128) if area < 96*96 else (512, 480)
|
47 |
+
|
48 |
+
|
49 |
+
INTERPOLATION_MODES = {
|
50 |
+
'bilinear': T.InterpolationMode.BILINEAR,
|
51 |
+
'bicubic': T.InterpolationMode.BICUBIC,
|
52 |
+
'nearest': T.InterpolationMode.NEAREST,
|
53 |
+
}
|
54 |
+
|
55 |
+
|
56 |
+
def build_transforms(cfg, is_train=True):
|
57 |
+
# assert isinstance(cfg.DATASET.OUTPUT_SIZE, (list, tuple)), 'DATASET.OUTPUT_SIZE should be list or tuple'
|
58 |
+
normalize = T.Normalize(
|
59 |
+
mean=cfg['IMAGE_ENCODER']['IMAGE_MEAN'],
|
60 |
+
std=cfg['IMAGE_ENCODER']['IMAGE_STD']
|
61 |
+
)
|
62 |
+
|
63 |
+
transforms = None
|
64 |
+
if is_train:
|
65 |
+
if 'THREE_AUG' in cfg['AUG']:
|
66 |
+
img_size = cfg['IMAGE_ENCODER']['IMAGE_SIZE']
|
67 |
+
remove_random_resized_crop = cfg['AUG']['THREE_AUG']['SRC']
|
68 |
+
mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
|
69 |
+
primary_tfl = []
|
70 |
+
scale=(0.08, 1.0)
|
71 |
+
interpolation='bicubic'
|
72 |
+
if remove_random_resized_crop:
|
73 |
+
primary_tfl = [
|
74 |
+
T.Resize(img_size, interpolation=3),
|
75 |
+
T.RandomCrop(img_size, padding=4,padding_mode='reflect'),
|
76 |
+
T.RandomHorizontalFlip()
|
77 |
+
]
|
78 |
+
else:
|
79 |
+
primary_tfl = [
|
80 |
+
RandomResizedCropAndInterpolation(
|
81 |
+
img_size, scale=scale, interpolation=interpolation),
|
82 |
+
T.RandomHorizontalFlip()
|
83 |
+
]
|
84 |
+
secondary_tfl = [T.RandomChoice([gray_scale(p=1.0),
|
85 |
+
Solarization(p=1.0),
|
86 |
+
GaussianBlurDeiTv3(p=1.0)])]
|
87 |
+
color_jitter = cfg['AUG']['THREE_AUG']['COLOR_JITTER']
|
88 |
+
if color_jitter is not None and not color_jitter==0:
|
89 |
+
secondary_tfl.append(T.ColorJitter(color_jitter, color_jitter, color_jitter))
|
90 |
+
final_tfl = [
|
91 |
+
T.ToTensor(),
|
92 |
+
T.Normalize(
|
93 |
+
mean=torch.tensor(mean),
|
94 |
+
std=torch.tensor(std))
|
95 |
+
]
|
96 |
+
return T.Compose(primary_tfl+secondary_tfl+final_tfl)
|
97 |
+
elif 'TIMM_AUG' in cfg['AUG'] and cfg['AUG']['TIMM_AUG']['USE_TRANSFORM']:
|
98 |
+
logger.info('=> use timm transform for training')
|
99 |
+
timm_cfg = cfg['AUG']['TIMM_AUG']
|
100 |
+
transforms = create_transform(
|
101 |
+
input_size=cfg['IMAGE_ENCODER']['IMAGE_SIZE'][0],
|
102 |
+
is_training=True,
|
103 |
+
use_prefetcher=False,
|
104 |
+
no_aug=False,
|
105 |
+
re_prob=timm_cfg.get('RE_PROB', 0.),
|
106 |
+
re_mode=timm_cfg.get('RE_MODE', 'const'),
|
107 |
+
re_count=timm_cfg.get('RE_COUNT', 1),
|
108 |
+
re_num_splits= 0 if not timm_cfg.get('RE_SPLITS', False) else timm_cfg['RE_SPLITS'], # if false or 0, return 0
|
109 |
+
scale=cfg['AUG'].get('SCALE', None),
|
110 |
+
ratio=cfg['AUG'].get('RATIO', None),
|
111 |
+
hflip=timm_cfg.get('HFLIP', 0.5),
|
112 |
+
vflip=timm_cfg.get('VFLIP', 0.),
|
113 |
+
color_jitter=timm_cfg.get('COLOR_JITTER', 0.4),
|
114 |
+
auto_augment=timm_cfg.get('AUTO_AUGMENT', None),
|
115 |
+
interpolation=cfg['AUG']['INTERPOLATION'],
|
116 |
+
mean=cfg['IMAGE_ENCODER']['IMAGE_MEAN'],
|
117 |
+
std=cfg['IMAGE_ENCODER']['IMAGE_STD'],
|
118 |
+
)
|
119 |
+
elif 'TORCHVISION_AUG' in cfg['AUG']:
|
120 |
+
logger.info('=> use torchvision transform fro training')
|
121 |
+
crop_size = cfg['IMAGE_ENCODER']['IMAGE_SIZE'][0]
|
122 |
+
interpolation = INTERPOLATION_MODES[cfg['AUG']['INTERPOLATION']]
|
123 |
+
trans = [
|
124 |
+
T.RandomResizedCrop(
|
125 |
+
crop_size, scale=cfg['AUG']['SCALE'], ratio=cfg['AUG']['RATIO'],
|
126 |
+
interpolation=interpolation
|
127 |
+
)
|
128 |
+
]
|
129 |
+
hflip_prob = cfg['AUG']['TORCHVISION_AUG']['HFLIP']
|
130 |
+
auto_augment_policy = cfg['AUG']['TORCHVISION_AUG'].get('AUTO_AUGMENT', None)
|
131 |
+
if hflip_prob > 0:
|
132 |
+
trans.append(T.RandomHorizontalFlip(hflip_prob))
|
133 |
+
if auto_augment_policy is not None:
|
134 |
+
if auto_augment_policy == "ra":
|
135 |
+
trans.append(RandAugment(interpolation=interpolation))
|
136 |
+
elif auto_augment_policy == "ta_wide":
|
137 |
+
trans.append(TrivialAugmentWide(interpolation=interpolation))
|
138 |
+
else:
|
139 |
+
aa_policy = AutoAugmentPolicy(auto_augment_policy)
|
140 |
+
trans.append(AutoAugment(policy=aa_policy, interpolation=interpolation))
|
141 |
+
trans.extend(
|
142 |
+
[
|
143 |
+
T.ToTensor(),
|
144 |
+
normalize,
|
145 |
+
]
|
146 |
+
)
|
147 |
+
random_erase_prob = cfg['AUG']['TORCHVISION_AUG']['RE_PROB']
|
148 |
+
random_erase_scale = cfg['AUG']['TORCHVISION_AUG'].get('RE_SCALE', 0.33)
|
149 |
+
if random_erase_prob > 0:
|
150 |
+
# NCFC (4/26/2023): Added scale parameter to random erasing for medical imaging
|
151 |
+
trans.append(T.RandomErasing(p=random_erase_prob, scale = (0.02, random_erase_scale)))
|
152 |
+
|
153 |
+
from torchvision.transforms import InterpolationMode
|
154 |
+
rotation = cfg['AUG']['TORCHVISION_AUG'].get('ROTATION', 0.0)
|
155 |
+
if (rotation > 0.0):
|
156 |
+
trans.append(T.RandomRotation(rotation, interpolation=InterpolationMode.BILINEAR))
|
157 |
+
logger.info(" TORCH AUG: Rotation: " + str(rotation))
|
158 |
+
|
159 |
+
transforms = T.Compose(trans)
|
160 |
+
elif cfg['AUG'].get('RANDOM_CENTER_CROP', False):
|
161 |
+
logger.info('=> use random center crop data augmentation')
|
162 |
+
# precrop, crop = get_resolution(cfg.TRAIN.IMAGE_SIZE)
|
163 |
+
crop = cfg['IMAGE_ENCODER']['IMAGE_SIZE'][0]
|
164 |
+
padding = cfg['AUG'].get('RANDOM_CENTER_CROP_PADDING', 32)
|
165 |
+
precrop = crop + padding
|
166 |
+
mode = INTERPOLATION_MODES[cfg['AUG']['INTERPOLATION']]
|
167 |
+
transforms = T.Compose([
|
168 |
+
T.Resize(
|
169 |
+
(precrop, precrop),
|
170 |
+
interpolation=mode
|
171 |
+
),
|
172 |
+
T.RandomCrop((crop, crop)),
|
173 |
+
T.RandomHorizontalFlip(),
|
174 |
+
T.ToTensor(),
|
175 |
+
normalize,
|
176 |
+
])
|
177 |
+
elif cfg['AUG'].get('MAE_FINETUNE_AUG', False):
|
178 |
+
mean = cfg['IMAGE_ENCODER']['IMAGE_MEAN']
|
179 |
+
std = cfg['IMAGE_ENCODER']['IMAGE_STD']
|
180 |
+
transforms = create_transform(
|
181 |
+
input_size=cfg['IMAGE_ENCODER']['IMAGE_SIZE'][0],
|
182 |
+
is_training=True,
|
183 |
+
color_jitter=cfg['AUG'].get('COLOR_JITTER', None),
|
184 |
+
auto_augment=cfg['AUG'].get('AUTO_AUGMENT', 'rand-m9-mstd0.5-inc1'),
|
185 |
+
interpolation='bicubic',
|
186 |
+
re_prob=cfg['AUG'].get('RE_PROB', 0.25),
|
187 |
+
re_mode=cfg['AUG'].get('RE_MODE', "pixel"),
|
188 |
+
re_count=cfg['AUG'].get('RE_COUNT', 1),
|
189 |
+
mean=mean,
|
190 |
+
std=std,
|
191 |
+
)
|
192 |
+
elif cfg['AUG'].get('MAE_PRETRAIN_AUG', False):
|
193 |
+
mean = cfg['IMAGE_ENCODER']['IMAGE_MEAN']
|
194 |
+
std = cfg['IMAGE_ENCODER']['IMAGE_STD']
|
195 |
+
transforms = T.Compose([
|
196 |
+
T.RandomResizedCrop(cfg['IMAGE_ENCODER']['IMAGE_SIZE'][0], scale=tuple(cfg['AUG']['SCALE']), interpolation=INTERPOLATION_MODES["bicubic"]), # 3 is bicubic
|
197 |
+
T.RandomHorizontalFlip(),
|
198 |
+
T.ToTensor(),
|
199 |
+
T.Normalize(mean=mean, std=std)])
|
200 |
+
elif cfg['AUG'].get('ThreeAugment', False): # from DeiT III
|
201 |
+
mean = cfg['IMAGE_ENCODER']['IMAGE_MEAN']
|
202 |
+
std = cfg['IMAGE_ENCODER']['IMAGE_STD']
|
203 |
+
img_size = cfg['IMAGE_ENCODER']['IMAGE_SIZE'][0]
|
204 |
+
remove_random_resized_crop = cfg['AUG'].get('src', False)
|
205 |
+
mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
|
206 |
+
primary_tfl = []
|
207 |
+
scale=(0.08, 1.0)
|
208 |
+
interpolation='bicubic'
|
209 |
+
if remove_random_resized_crop:
|
210 |
+
primary_tfl = [
|
211 |
+
T.Resize(img_size, interpolation=3), # bicubic
|
212 |
+
T.RandomCrop(img_size, padding=4,padding_mode='reflect'),
|
213 |
+
T.RandomHorizontalFlip()
|
214 |
+
]
|
215 |
+
else:
|
216 |
+
primary_tfl = [
|
217 |
+
timm.data.transforms.RandomResizedCropAndInterpolation(
|
218 |
+
img_size, scale=scale, interpolation=interpolation),
|
219 |
+
T.RandomHorizontalFlip()
|
220 |
+
]
|
221 |
+
|
222 |
+
secondary_tfl = [T.RandomChoice([deitIII_gray_scale(p=1.0),
|
223 |
+
deitIII_Solarization(p=1.0),
|
224 |
+
deitIII_GaussianBlur(p=1.0)])]
|
225 |
+
color_jitter = cfg['AUG']['COLOR_JITTER']
|
226 |
+
secondary_tfl.append(T.ColorJitter(color_jitter, color_jitter, color_jitter))
|
227 |
+
final_tfl = [
|
228 |
+
T.ToTensor(),
|
229 |
+
T.Normalize(
|
230 |
+
mean=torch.tensor(mean),
|
231 |
+
std=torch.tensor(std))
|
232 |
+
]
|
233 |
+
transforms = T.Compose(primary_tfl+secondary_tfl+final_tfl)
|
234 |
+
logger.info('=> training transformers: {}'.format(transforms))
|
235 |
+
else:
|
236 |
+
mode = INTERPOLATION_MODES[cfg['AUG']['INTERPOLATION']]
|
237 |
+
if cfg['TEST']['CENTER_CROP']:
|
238 |
+
transforms = T.Compose([
|
239 |
+
T.Resize(
|
240 |
+
int(cfg['IMAGE_ENCODER']['IMAGE_SIZE'][0] / 0.875),
|
241 |
+
# the same behavior as in deit: size = int((256 / 224) * args.input_size)
|
242 |
+
# 224 / 256 = 0.875
|
243 |
+
interpolation=mode
|
244 |
+
),
|
245 |
+
T.CenterCrop(cfg['IMAGE_ENCODER']['IMAGE_SIZE'][0]),
|
246 |
+
T.ToTensor(),
|
247 |
+
normalize,
|
248 |
+
])
|
249 |
+
else:
|
250 |
+
transforms = T.Compose([
|
251 |
+
T.Resize(
|
252 |
+
(cfg['IMAGE_ENCODER']['IMAGE_SIZE'][1], cfg['IMAGE_ENCODER']['IMAGE_SIZE'][0]),
|
253 |
+
interpolation=mode
|
254 |
+
),
|
255 |
+
T.ToTensor(),
|
256 |
+
normalize,
|
257 |
+
])
|
258 |
+
logger.info('=> testing transformers: {}'.format(transforms))
|
259 |
+
|
260 |
+
return transforms
|
261 |
+
|
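The branches above are selected by which keys are present under cfg['AUG'], checked in order (THREE_AUG, TIMM_AUG, TORCHVISION_AUG, RANDOM_CENTER_CROP, MAE_FINETUNE_AUG, MAE_PRETRAIN_AUG, ThreeAugment). Below is a minimal sketch of a config dict that would reach the TORCHVISION_AUG branch during training and the resize/center-crop path at test time; the key names mirror the lookups in build_transforms above, but the values are illustrative placeholders, not the settings shipped in this commit.

# Illustrative config sketch (not part of this commit); values are placeholders.
cfg = {
    'IMAGE_ENCODER': {
        'IMAGE_SIZE': [224, 224],
        'IMAGE_MEAN': [0.485, 0.456, 0.406],
        'IMAGE_STD': [0.229, 0.224, 0.225],
    },
    'AUG': {
        'INTERPOLATION': 'bicubic',
        'SCALE': (0.08, 1.0),
        'RATIO': (3.0 / 4.0, 4.0 / 3.0),
        'TORCHVISION_AUG': {'HFLIP': 0.5, 'RE_PROB': 0.25},
    },
    'TEST': {'CENTER_CROP': True},
}

train_transforms = build_transforms(cfg, is_train=True)   # RandomResizedCrop + HFlip + ToTensor + Normalize + RandomErasing
test_transforms = build_transforms(cfg, is_train=False)   # Resize(size / 0.875) + CenterCrop + ToTensor + Normalize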
MedImageInsight/ImageDataLoader/transforms/threeaugment.py
ADDED
@@ -0,0 +1,54 @@
1 |
+
import random
|
2 |
+
from PIL import ImageFilter, ImageOps
|
3 |
+
from torchvision import transforms
|
4 |
+
|
5 |
+
|
6 |
+
class deitIII_GaussianBlur(object):
|
7 |
+
"""
|
8 |
+
Apply Gaussian Blur to the PIL image.
|
9 |
+
"""
|
10 |
+
def __init__(self, p=0.1, radius_min=0.1, radius_max=2.):
|
11 |
+
self.prob = p
|
12 |
+
self.radius_min = radius_min
|
13 |
+
self.radius_max = radius_max
|
14 |
+
|
15 |
+
def __call__(self, img):
|
16 |
+
do_it = random.random() <= self.prob
|
17 |
+
if not do_it:
|
18 |
+
return img
|
19 |
+
|
20 |
+
img = img.filter(
|
21 |
+
ImageFilter.GaussianBlur(
|
22 |
+
radius=random.uniform(self.radius_min, self.radius_max)
|
23 |
+
)
|
24 |
+
)
|
25 |
+
return img
|
26 |
+
|
27 |
+
|
28 |
+
class deitIII_Solarization(object):
|
29 |
+
"""
|
30 |
+
Apply Solarization to the PIL image.
|
31 |
+
"""
|
32 |
+
def __init__(self, p=0.2):
|
33 |
+
self.p = p
|
34 |
+
|
35 |
+
def __call__(self, img):
|
36 |
+
if random.random() < self.p:
|
37 |
+
return ImageOps.solarize(img)
|
38 |
+
else:
|
39 |
+
return img
|
40 |
+
|
41 |
+
|
42 |
+
class deitIII_gray_scale(object):
|
43 |
+
"""
|
44 |
+
Apply grayscale conversion to the PIL image.
|
45 |
+
"""
|
46 |
+
def __init__(self, p=0.2):
|
47 |
+
self.p = p
|
48 |
+
self.transf = transforms.Grayscale(3)
|
49 |
+
|
50 |
+
def __call__(self, img):
|
51 |
+
if random.random() < self.p:
|
52 |
+
return self.transf(img)
|
53 |
+
else:
|
54 |
+
return img
|
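A short usage sketch (illustrative only, not part of the commit): the three DeiT-III ops above are meant to be mutually exclusive per sample, which is how transforms/build.py combines them via T.RandomChoice. The sketch assumes the three classes are importable from this module.

# Sketch: apply exactly one of the three DeiT-III ops per image,
# mirroring the RandomChoice usage in transforms/build.py above.
from PIL import Image
from torchvision import transforms as T

three_aug = T.RandomChoice([
    deitIII_gray_scale(p=1.0),
    deitIII_Solarization(p=1.0),
    deitIII_GaussianBlur(p=1.0),
])

img = Image.new('RGB', (224, 224))  # placeholder image
augmented = three_aug(img)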
MedImageInsight/ImageDataLoader/tsv.py
ADDED
@@ -0,0 +1,351 @@
1 |
+
from __future__ import absolute_import
|
2 |
+
from __future__ import division
|
3 |
+
from __future__ import print_function
|
4 |
+
|
5 |
+
import os
|
6 |
+
from io import BytesIO
|
7 |
+
import json
|
8 |
+
import logging
|
9 |
+
import base64
|
10 |
+
import random
|
11 |
+
from typing import Callable, List, Tuple, Union, NamedTuple
|
12 |
+
from PIL import Image
|
13 |
+
from PIL import ImageFile
|
14 |
+
import torch.utils.data as data
|
15 |
+
from .languages.prompt_engineering import prompt_engineering
|
16 |
+
from .tsv_file import TSVFile, CompositeTSVFile
|
17 |
+
|
18 |
+
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
19 |
+
|
20 |
+
logger = logging.getLogger(__name__)
|
21 |
+
|
22 |
+
|
23 |
+
class TSVDataset(data.Dataset):
|
24 |
+
|
25 |
+
def __init__(self,
|
26 |
+
tsv_file: Union[str, List[str]],
|
27 |
+
transform: Callable = None,
|
28 |
+
map_file: str = None,
|
29 |
+
token_file: str = None,
|
30 |
+
is_train: bool = True,
|
31 |
+
azcopy_path: str = None):
|
32 |
+
self.transform = transform
|
33 |
+
self._chunk_sizes = None
|
34 |
+
self.label2idx = self._load_map(map_file)
|
35 |
+
self.class_selector = list(self.label2idx.keys()) if self.label2idx else None
|
36 |
+
|
37 |
+
if isinstance(tsv_file, str):
|
38 |
+
if os.path.splitext(tsv_file)[1] == '.tsv':
|
39 |
+
self.tsv_file = TSVFile(
|
40 |
+
tsv_file, class_selector=self.class_selector
|
41 |
+
)
|
42 |
+
else:
|
43 |
+
self.tsv_file = CompositeTSVFile(
|
44 |
+
tsv_file,
|
45 |
+
class_selector=self.class_selector,
|
46 |
+
is_train=is_train,
|
47 |
+
sas_token_path=token_file,
|
48 |
+
azcopy_path=azcopy_path
|
49 |
+
)
|
50 |
+
self._chunk_sizes = self.tsv_file.get_chunk_size()
|
51 |
+
elif isinstance(tsv_file, list):
|
52 |
+
self.tsv_file = CompositeTSVFile(
|
53 |
+
tsv_file,
|
54 |
+
class_selector=self.class_selector,
|
55 |
+
is_train=is_train,
|
56 |
+
sas_token_path=token_file,
|
57 |
+
azcopy_path=azcopy_path
|
58 |
+
)
|
59 |
+
self._chunk_sizes = self.tsv_file.get_chunk_size()
|
60 |
+
else:
|
61 |
+
raise ValueError("Invalid input! Please check the tsv filenames")
|
62 |
+
|
63 |
+
logger.debug('=> {}\titems: {}'.format(tsv_file, len(self.tsv_file)))
|
64 |
+
|
65 |
+
def fetch_blob(self, idx):
|
66 |
+
image_tsv = self.tsv_file.file_list[idx]
|
67 |
+
self.tsv_file.blob_storage.fetch_blob(image_tsv)
|
68 |
+
|
69 |
+
def num_classes(self):
|
70 |
+
return len(self.class_selector)
|
71 |
+
|
72 |
+
def get_chunk_sizes(self):
|
73 |
+
return self._chunk_sizes
|
74 |
+
|
75 |
+
def get_class_boundaries(self):
|
76 |
+
# The samples of each class are organized class-by-class.
|
77 |
+
# _class_boundaries stores the lower- and upper-bound of each class.
|
78 |
+
return self.tsv_file.get_class_boundaries()
|
79 |
+
|
80 |
+
def get_filenames(self):
|
81 |
+
filenames = [
|
82 |
+
self.tsv_file.get_key(i)
|
83 |
+
for i in range(self.tsv_file.num_rows())
|
84 |
+
]
|
85 |
+
|
86 |
+
return filenames
|
87 |
+
|
88 |
+
def _load_map(self, map_file: str):
|
89 |
+
if not map_file:
|
90 |
+
return None
|
91 |
+
|
92 |
+
label2idx = {}
|
93 |
+
with open(map_file) as f:
|
94 |
+
for line in f:
|
95 |
+
items = line.strip().split('\t')
|
96 |
+
label2idx[items[0]] = int(items[1])
|
97 |
+
|
98 |
+
return label2idx
|
99 |
+
|
100 |
+
def __getitem__(self, index: Union[int, Tuple[int, int]]):
|
101 |
+
items = self.tsv_file[index]
|
102 |
+
_, target, img = self._decode_data(items)
|
103 |
+
|
104 |
+
if self.transform:
|
105 |
+
img = self.transform(img)
|
106 |
+
|
107 |
+
return img, target
|
108 |
+
|
109 |
+
def _decode_data(self, items: Tuple[str, str, str]):
|
110 |
+
key = items[0]
|
111 |
+
label = self._get_label(items[1])
|
112 |
+
image = Image.open(BytesIO(base64.b64decode(items[2]))).convert('RGB')
|
113 |
+
|
114 |
+
return key, label, image
|
115 |
+
|
116 |
+
def _get_label(self, item: str):
|
117 |
+
if not self.label2idx:
|
118 |
+
return int(item)
|
119 |
+
|
120 |
+
js = json.loads(item)
|
121 |
+
return self.label2idx[js[0]['class']]
|
122 |
+
|
123 |
+
def __len__(self):
|
124 |
+
return len(self.tsv_file)
|
125 |
+
|
126 |
+
|
127 |
+
class TSVMeta(NamedTuple):
|
128 |
+
source: str
|
129 |
+
num_classes: int
|
130 |
+
task: str
|
131 |
+
|
132 |
+
|
133 |
+
class TSVImageTextDatasetV2(data.Dataset):
|
134 |
+
"""
|
135 |
+
This class is intended for encapsulating Image/Text pair data for contrastive learning described in
|
136 |
+
the following paper,
|
137 |
+
"Learning Transferable Visual Models From Natural Language Supervision" (a.k.a CLIP)
|
138 |
+
V2: support image text pairs and supervised classification data
|
139 |
+
"""
|
140 |
+
|
141 |
+
def __init__(self,
|
142 |
+
image_tsv_file: Union[str, List[str]],
|
143 |
+
text_tsv_file: Union[str, List[str]],
|
144 |
+
transform: Callable = None,
|
145 |
+
tokenize: Callable = None,
|
146 |
+
context_length: int = 77,
|
147 |
+
num_captions: int = 1,
|
148 |
+
text_format: str = 'txt',
|
149 |
+
is_train: bool = True,
|
150 |
+
sas_token_path: str = None,
|
151 |
+
azcopy_path: str = None,
|
152 |
+
metas: List[NamedTuple] = None,
|
153 |
+
prompt_engineering=True,
|
154 |
+
concat_queries=False):
|
155 |
+
self.transform = transform
|
156 |
+
self.tokenize = tokenize
|
157 |
+
self._chunk_sizes = None
|
158 |
+
self.context_length = context_length
|
159 |
+
self.num_captions = num_captions
|
160 |
+
self.text_format = text_format
|
161 |
+
self.tsv_file_list = []
|
162 |
+
self.metas = metas
|
163 |
+
self.label_offsets = self.build_label_offsets()
|
164 |
+
self.prompt_engineering = prompt_engineering
|
165 |
+
self.concat_queries = concat_queries
|
166 |
+
|
167 |
+
if isinstance(image_tsv_file, str) and isinstance(text_tsv_file, str):
|
168 |
+
# single tsv file
|
169 |
+
if (
|
170 |
+
os.path.splitext(image_tsv_file)[1].lower() == '.tsv'
|
171 |
+
and os.path.splitext(text_tsv_file)[1].lower() == '.tsv'
|
172 |
+
):
|
173 |
+
self.tsv_file_list.append((image_tsv_file, text_tsv_file))
|
174 |
+
self.image_tsv_file = TSVFile(
|
175 |
+
image_tsv_file, if_generate_lineidx=True
|
176 |
+
)
|
177 |
+
self.text_tsv_file = TSVFile(
|
178 |
+
text_tsv_file, if_generate_lineidx=True
|
179 |
+
)
|
180 |
+
else:
|
181 |
+
raise ValueError("Invalid input! Please check the tsv filenames.")
|
182 |
+
# multiple tsv files specified in a list
|
183 |
+
elif (
|
184 |
+
isinstance(image_tsv_file, list)
|
185 |
+
and isinstance(text_tsv_file, list)
|
186 |
+
):
|
187 |
+
assert len(image_tsv_file) == len(text_tsv_file), \
|
188 |
+
"Inconsistent number of Image/Text tsv files!"
|
189 |
+
self.tsv_file_list = [
|
190 |
+
(txt, img)
|
191 |
+
for img, txt in zip(image_tsv_file, text_tsv_file)
|
192 |
+
]
|
193 |
+
self.image_tsv_file = CompositeTSVFile(
|
194 |
+
image_tsv_file,
|
195 |
+
is_train=is_train,
|
196 |
+
sas_token_path=sas_token_path,
|
197 |
+
azcopy_path=azcopy_path
|
198 |
+
)
|
199 |
+
self.text_tsv_file = CompositeTSVFile(
|
200 |
+
text_tsv_file,
|
201 |
+
is_train=is_train,
|
202 |
+
sas_token_path=sas_token_path,
|
203 |
+
azcopy_path=azcopy_path
|
204 |
+
)
|
205 |
+
self._chunk_sizes = self.image_tsv_file.get_chunk_size()
|
206 |
+
else:
|
207 |
+
raise ValueError("Invalid input! Please check the tsv filenames.")
|
208 |
+
|
209 |
+
assert len(self.image_tsv_file) == len(self.text_tsv_file), \
|
210 |
+
"Inconsistent size of Image/Text ({}/{}) data!".format(
|
211 |
+
len(self.image_tsv_file), len(self.text_tsv_file)
|
212 |
+
)
|
213 |
+
|
214 |
+
def build_label_offsets(self):
|
215 |
+
if self.metas is None:
|
216 |
+
return None
|
217 |
+
|
218 |
+
label_offsets = {}
|
219 |
+
offset = 1
|
220 |
+
for meta in self.metas:
|
221 |
+
print(meta)
|
222 |
+
print(label_offsets)
|
223 |
+
label_offsets[meta.source] = offset
|
224 |
+
offset += meta.num_classes
|
225 |
+
|
226 |
+
return label_offsets
|
227 |
+
|
228 |
+
def fetch_blob(self, idx):
|
229 |
+
# image_tsv, text_tsv = self.tsv_file_list[idx]
|
230 |
+
image_tsv = self.image_tsv_file.file_list[idx]
|
231 |
+
text_tsv = self.text_tsv_file.file_list[idx]
|
232 |
+
self.image_tsv_file.blob_storage.fetch_blob(image_tsv)
|
233 |
+
self.text_tsv_file.blob_storage.fetch_blob(text_tsv)
|
234 |
+
|
235 |
+
def get_chunk_sizes(self):
|
236 |
+
return self._chunk_sizes
|
237 |
+
|
238 |
+
def __getitem__(self, index: Union[int, Tuple[int, int]]):
|
239 |
+
if index is None:
|
240 |
+
import torch
|
241 |
+
return torch.tensor([], dtype=torch.float32), \
|
242 |
+
torch.tensor([], dtype=torch.int64), \
|
243 |
+
torch.tensor([], dtype=torch.int64)
|
244 |
+
|
245 |
+
items_image = self.image_tsv_file[index]
|
246 |
+
items_text = self.text_tsv_file[index]
|
247 |
+
|
248 |
+
assert items_text[0] == items_image[0], \
|
249 |
+
'keys do not match for image and text {} vs {}'.format(
|
250 |
+
items_text[0], items_image[0]
|
251 |
+
)
|
252 |
+
|
253 |
+
_, img = self._decode_image(items_image)
|
254 |
+
_, txt, label = self._decode_text(items_text)
|
255 |
+
|
256 |
+
if self.transform:
|
257 |
+
img = self.transform(img)
|
258 |
+
|
259 |
+
tokens = self.tokenize(
|
260 |
+
txt, padding='max_length', truncation=True, max_length=self.context_length,
|
261 |
+
return_tensors='pt'
|
262 |
+
) if self.tokenize else txt
|
263 |
+
|
264 |
+
tokens['input_ids'].squeeze_()
|
265 |
+
tokens['attention_mask'].squeeze_()
|
266 |
+
|
267 |
+
return img, tokens, label
|
268 |
+
|
269 |
+
def _decode_image(self, items: Tuple[str, str]):
|
270 |
+
key = items[0]
|
271 |
+
image = Image.open(BytesIO(base64.b64decode(items[1]))).convert('RGB')
|
272 |
+
|
273 |
+
return key, image
|
274 |
+
|
275 |
+
def _decode_text(self, items: Tuple[str, Union[str, dict]]):
|
276 |
+
key = items[0]
|
277 |
+
text = ''
|
278 |
+
|
279 |
+
if self.text_format != 'json':
|
280 |
+
raise ValueError('Only support json format')
|
281 |
+
|
282 |
+
# Do some reasonable handling of occasionally bad data.
|
283 |
+
try:
|
284 |
+
js = json.loads(items[1])
|
285 |
+
except Exception as e:
|
286 |
+
|
287 |
+
# empty dictionary
|
288 |
+
js = {}
|
289 |
+
|
290 |
+
# Record the data error in the log.
|
291 |
+
logger.info("JSON parsing error on: " + items[1])
|
292 |
+
logger.info(str(e))
|
293 |
+
|
294 |
+
# do not raise the exception
|
295 |
+
# raise e
|
296 |
+
|
297 |
+
# put some text in and continue processing data (do not kill job)
|
298 |
+
sstr = items[1].find("\"")
|
299 |
+
if (sstr < 0):
|
300 |
+
sstr = 0
|
301 |
+
|
302 |
+
estr = items[1][sstr:].find("\"")
|
303 |
+
if (estr < 0):
|
304 |
+
estr = len(items[1])
|
305 |
+
|
306 |
+
text = items[1][sstr:estr]
|
307 |
+
if (len(text) < 2):
|
308 |
+
text = "A picture showing some content."
|
309 |
+
|
310 |
+
label = 0
|
311 |
+
|
312 |
+
if 'captions' in js:
|
313 |
+
captions = js['captions']
|
314 |
+
if isinstance(captions, list):
|
315 |
+
if self.num_captions == 1:
|
316 |
+
text = random.choice(captions)
|
317 |
+
else:
|
318 |
+
text = captions
|
319 |
+
if len(captions) > self.num_captions:
|
320 |
+
text = captions[:self.num_captions]
|
321 |
+
elif isinstance(captions, str):
|
322 |
+
text = captions
|
323 |
+
else:
|
324 |
+
raise ValueError('captions should be str or list')
|
325 |
+
label = 0
|
326 |
+
elif 'tags' in js:
|
327 |
+
text = prompt_engineering(js['tags'])
|
328 |
+
label = 0
|
329 |
+
elif 'task' in js and js['task'] == 'classification':
|
330 |
+
if (self.prompt_engineering):
|
331 |
+
text = prompt_engineering(js['class_name'])
|
332 |
+
else:
|
333 |
+
text = js['class_name']
|
334 |
+
label = js['class_id']
|
335 |
+
|
336 |
+
if (self.label_offsets is not None):
|
337 |
+
if (js['source'] in self.label_offsets):
|
338 |
+
label += self.label_offsets[js['source']]
|
339 |
+
|
340 |
+
if (self.concat_queries):
|
341 |
+
if ('queries' in js) and (len(js['queries']) > 0):
|
342 |
+
q = ''
|
343 |
+
for item in js['queries']:
|
344 |
+
q = q + item + ' '
|
345 |
+
|
346 |
+
text = q + ', ' + text
|
347 |
+
|
348 |
+
return key, text, label
|
349 |
+
|
350 |
+
def __len__(self):
|
351 |
+
return len(self.image_tsv_file)
|
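The row layouts these datasets decode follow from _decode_data, _decode_image and _decode_text above. The sketch below shows what matching image and text rows could look like; it is an illustrative assumption, not data shipped with this commit.

# Sketch of the expected TSV row layouts:
#   TSVDataset:             key \t label-or-json \t base64(image)
#   TSVImageTextDatasetV2:  image row "key \t base64(image)" paired with a
#                           text row  "key \t json" sharing the same key.
import base64
import json
from io import BytesIO
from PIL import Image

buf = BytesIO()
Image.new('RGB', (8, 8)).save(buf, format='JPEG')
b64_image = base64.b64encode(buf.getvalue()).decode('utf-8')

image_row = '\t'.join(['sample_0001', b64_image])
text_row = '\t'.join(['sample_0001', json.dumps({'captions': ['a frontal chest x-ray']})])
classification_text_row = '\t'.join([
    'sample_0002',
    json.dumps({'task': 'classification', 'class_name': 'pneumonia', 'class_id': 3, 'source': 'toy_set'}),
])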
MedImageInsight/ImageDataLoader/tsv_file.py
ADDED
@@ -0,0 +1,290 @@
1 |
+
import logging
|
2 |
+
import gc
|
3 |
+
import os
|
4 |
+
import os.path as op
|
5 |
+
import json
|
6 |
+
from typing import List
|
7 |
+
from .blob_storage import BlobStorage, disk_usage
|
8 |
+
|
9 |
+
logger = logging.getLogger(__name__)
|
10 |
+
|
11 |
+
|
12 |
+
def generate_lineidx(filein: str, idxout: str) -> None:
|
13 |
+
idxout_tmp = idxout + '.tmp'
|
14 |
+
with open(filein, 'r') as tsvin, open(idxout_tmp, 'w') as tsvout:
|
15 |
+
fsize = os.fstat(tsvin.fileno()).st_size
|
16 |
+
fpos = 0
|
17 |
+
while fpos != fsize:
|
18 |
+
tsvout.write(str(fpos) + "\n")
|
19 |
+
tsvin.readline()
|
20 |
+
fpos = tsvin.tell()
|
21 |
+
os.rename(idxout_tmp, idxout)
|
22 |
+
|
23 |
+
|
24 |
+
def read_to_character(fp, c):
|
25 |
+
result = []
|
26 |
+
while True:
|
27 |
+
s = fp.read(32)
|
28 |
+
assert s != ''
|
29 |
+
if c in s:
|
30 |
+
result.append(s[: s.index(c)])
|
31 |
+
break
|
32 |
+
else:
|
33 |
+
result.append(s)
|
34 |
+
return ''.join(result)
|
35 |
+
|
36 |
+
|
37 |
+
class TSVFile(object):
|
38 |
+
def __init__(self,
|
39 |
+
tsv_file: str,
|
40 |
+
if_generate_lineidx: bool = True,
|
41 |
+
lineidx: str = None,
|
42 |
+
class_selector: List[str] = None,
|
43 |
+
blob_storage: BlobStorage = None):
|
44 |
+
self.tsv_file = tsv_file
|
45 |
+
self.lineidx = op.splitext(tsv_file)[0] + '.lineidx' \
|
46 |
+
if not lineidx else lineidx
|
47 |
+
self.linelist = op.splitext(tsv_file)[0] + '.linelist'
|
48 |
+
self.chunks = op.splitext(tsv_file)[0] + '.chunks'
|
49 |
+
self._fp = None
|
50 |
+
self._lineidx = None
|
51 |
+
self._sample_indices = None
|
52 |
+
self._class_boundaries = None
|
53 |
+
self._class_selector = class_selector
|
54 |
+
self._blob_storage = blob_storage
|
55 |
+
self._len = None
|
56 |
+
# remember the pid of the process which opened the file.
|
57 |
+
# If the pid is not equal to the current pid, we will re-open the file.
|
58 |
+
self.pid = None
|
59 |
+
# generate lineidx if not exist
|
60 |
+
if not op.isfile(self.lineidx) and if_generate_lineidx:
|
61 |
+
generate_lineidx(self.tsv_file, self.lineidx)
|
62 |
+
|
63 |
+
def __del__(self):
|
64 |
+
self.gcidx()
|
65 |
+
if self._fp:
|
66 |
+
self._fp.close()
|
67 |
+
# physically remove the tsv file if it is retrieved by BlobStorage
|
68 |
+
if self._blob_storage and 'azcopy' in self.tsv_file and os.path.exists(self.tsv_file):
|
69 |
+
try:
|
70 |
+
original_usage = disk_usage('/')
|
71 |
+
os.remove(self.tsv_file)
|
72 |
+
logger.info("Purged %s (disk usage: %.2f%% => %.2f%%)" %
|
73 |
+
(self.tsv_file, original_usage, disk_usage('/') * 100))
|
74 |
+
except:
|
75 |
+
# Known issue: multiple threads attempting to delete the file will raise a FileNotFound error.
|
76 |
+
# TODO: try threading.Lock to better handle the race condition
|
77 |
+
pass
|
78 |
+
|
79 |
+
def __str__(self):
|
80 |
+
return "TSVFile(tsv_file='{}')".format(self.tsv_file)
|
81 |
+
|
82 |
+
def __repr__(self):
|
83 |
+
return str(self)
|
84 |
+
|
85 |
+
def gcidx(self):
|
86 |
+
logger.debug('Run gc collect')
|
87 |
+
self._lineidx = None
|
88 |
+
self._sample_indices = None
|
89 |
+
#self._class_boundaries = None
|
90 |
+
return gc.collect()
|
91 |
+
|
92 |
+
def get_class_boundaries(self):
|
93 |
+
return self._class_boundaries
|
94 |
+
|
95 |
+
def num_rows(self, gcf=False):
|
96 |
+
if (self._len is None):
|
97 |
+
self._ensure_lineidx_loaded()
|
98 |
+
retval = len(self._sample_indices)
|
99 |
+
|
100 |
+
if (gcf):
|
101 |
+
self.gcidx()
|
102 |
+
|
103 |
+
self._len = retval
|
104 |
+
|
105 |
+
return self._len
|
106 |
+
|
107 |
+
def seek(self, idx: int):
|
108 |
+
self._ensure_tsv_opened()
|
109 |
+
self._ensure_lineidx_loaded()
|
110 |
+
try:
|
111 |
+
pos = self._lineidx[self._sample_indices[idx]]
|
112 |
+
except:
|
113 |
+
logger.info('=> {}-{}'.format(self.tsv_file, idx))
|
114 |
+
raise
|
115 |
+
self._fp.seek(pos)
|
116 |
+
return [s.strip() for s in self._fp.readline().split('\t')]
|
117 |
+
|
118 |
+
def seek_first_column(self, idx: int):
|
119 |
+
self._ensure_tsv_opened()
|
120 |
+
self._ensure_lineidx_loaded()
|
121 |
+
pos = self._lineidx[idx]
|
122 |
+
self._fp.seek(pos)
|
123 |
+
return read_to_character(self._fp, '\t')
|
124 |
+
|
125 |
+
def get_key(self, idx: int):
|
126 |
+
return self.seek_first_column(idx)
|
127 |
+
|
128 |
+
def __getitem__(self, index: int):
|
129 |
+
return self.seek(index)
|
130 |
+
|
131 |
+
def __len__(self):
|
132 |
+
return self.num_rows()
|
133 |
+
|
134 |
+
def _ensure_lineidx_loaded(self):
|
135 |
+
if self._lineidx is None:
|
136 |
+
logger.debug('=> loading lineidx: {}'.format(self.lineidx))
|
137 |
+
with open(self.lineidx, 'r') as fp:
|
138 |
+
lines = fp.readlines()
|
139 |
+
lines = [line.strip() for line in lines]
|
140 |
+
self._lineidx = [int(line) for line in lines]
|
141 |
+
|
142 |
+
# read the line list if exists
|
143 |
+
linelist = None
|
144 |
+
if op.isfile(self.linelist):
|
145 |
+
with open(self.linelist, 'r') as fp:
|
146 |
+
linelist = sorted(
|
147 |
+
[
|
148 |
+
int(line.strip())
|
149 |
+
for line in fp.readlines()
|
150 |
+
]
|
151 |
+
)
|
152 |
+
|
153 |
+
if op.isfile(self.chunks):
|
154 |
+
self._sample_indices = []
|
155 |
+
self._class_boundaries = []
|
156 |
+
class_boundaries = json.load(open(self.chunks, 'r'))
|
157 |
+
for class_name, boundary in class_boundaries.items():
|
158 |
+
start = len(self._sample_indices)
|
159 |
+
if class_name in self._class_selector:
|
160 |
+
for idx in range(boundary[0], boundary[1] + 1):
|
161 |
+
# NOTE: potentially slow when linelist is long, try to speed it up
|
162 |
+
if linelist and idx not in linelist:
|
163 |
+
continue
|
164 |
+
self._sample_indices.append(idx)
|
165 |
+
end = len(self._sample_indices)
|
166 |
+
self._class_boundaries.append((start, end))
|
167 |
+
else:
|
168 |
+
if linelist:
|
169 |
+
self._sample_indices = linelist
|
170 |
+
else:
|
171 |
+
self._sample_indices = list(range(len(self._lineidx)))
|
172 |
+
|
173 |
+
def _ensure_tsv_opened(self):
|
174 |
+
if self._fp is None:
|
175 |
+
if self._blob_storage:
|
176 |
+
self._fp = self._blob_storage.open(self.tsv_file)
|
177 |
+
else:
|
178 |
+
self._fp = open(self.tsv_file, 'r')
|
179 |
+
self.pid = os.getpid()
|
180 |
+
|
181 |
+
if self.pid != os.getpid():
|
182 |
+
logger.debug('=> re-open {} because the process id changed'.format(self.tsv_file))
|
183 |
+
self._fp = open(self.tsv_file, 'r')
|
184 |
+
self.pid = os.getpid()
|
185 |
+
|
186 |
+
|
187 |
+
class CompositeTSVFile:
|
188 |
+
def __init__(self,
|
189 |
+
file_list: List[str],
|
190 |
+
root: str = '.',
|
191 |
+
class_selector: List[str] = None,
|
192 |
+
is_train: bool = True,
|
193 |
+
sas_token_path: str = None,
|
194 |
+
azcopy_path: str = None):
|
195 |
+
self.root = root
|
196 |
+
self.tsvs = None
|
197 |
+
self.chunk_sizes = None
|
198 |
+
self.accum_chunk_sizes = None
|
199 |
+
self._class_selector = class_selector
|
200 |
+
self._class_boundaries = None
|
201 |
+
self.initialized = False
|
202 |
+
assert isinstance(file_list, list)
|
203 |
+
self.blob_storage = BlobStorage(is_train, sas_token_path, azcopy_path)
|
204 |
+
self.file_list = self.blob_storage.register_local_tsv_paths(file_list)
|
205 |
+
logger.info('=> Init CompositeTSVFile...')
|
206 |
+
self.initialize()
|
207 |
+
logger.info('=> Init CompositeTSVFile Done...')
|
208 |
+
|
209 |
+
def get_key(self, index: int):
|
210 |
+
idx_source, idx_row = self._calc_chunk_idx_row(index)
|
211 |
+
k = self.tsvs[idx_source].get_key(idx_row)
|
212 |
+
return '_'.join([self.file_list[idx_source], k])
|
213 |
+
|
214 |
+
def get_class_boundaries(self):
|
215 |
+
return self._class_boundaries
|
216 |
+
|
217 |
+
def get_chunk_size(self):
|
218 |
+
return self.chunk_sizes
|
219 |
+
|
220 |
+
def num_rows(self):
|
221 |
+
return sum(self.chunk_sizes)
|
222 |
+
|
223 |
+
def _calc_chunk_idx_row(self, index: int):
|
224 |
+
idx_chunk = 0
|
225 |
+
idx_row = index
|
226 |
+
while index >= self.accum_chunk_sizes[idx_chunk]:
|
227 |
+
idx_chunk += 1
|
228 |
+
idx_row = index - self.accum_chunk_sizes[idx_chunk-1]
|
229 |
+
return idx_chunk, idx_row
|
230 |
+
|
231 |
+
def __getitem__(self, index: int):
|
232 |
+
idx_source, idx_row = self._calc_chunk_idx_row(index)
|
233 |
+
if idx_source not in self.blob_storage:
|
234 |
+
self.blob_storage[idx_source] = TSVFile(
|
235 |
+
op.join(self.root, self.file_list[idx_source]),
|
236 |
+
class_selector=self._class_selector,
|
237 |
+
blob_storage=self.blob_storage,
|
238 |
+
if_generate_lineidx=True
|
239 |
+
)
|
240 |
+
return self.blob_storage[idx_source].seek(idx_row)
|
241 |
+
|
242 |
+
def __len__(self):
|
243 |
+
return sum(self.chunk_sizes)
|
244 |
+
|
245 |
+
def initialize(self):
|
246 |
+
"""
|
247 |
+
this function has to be called in the init function if cache_policy is
|
248 |
+
enabled. Thus, let's always call it in the init function to make it simple.
|
249 |
+
"""
|
250 |
+
if self.initialized:
|
251 |
+
return
|
252 |
+
self.tsvs = [
|
253 |
+
TSVFile(
|
254 |
+
op.join(self.root, f),
|
255 |
+
class_selector=self._class_selector
|
256 |
+
) for f in self.file_list
|
257 |
+
]
|
258 |
+
logger.debug("=> Calculating chunk sizes ...")
|
259 |
+
self.chunk_sizes = [tsv.num_rows(gcf=True) for tsv in self.tsvs]
|
260 |
+
|
261 |
+
self.accum_chunk_sizes = [0]
|
262 |
+
for size in self.chunk_sizes:
|
263 |
+
self.accum_chunk_sizes += [self.accum_chunk_sizes[-1] + size]
|
264 |
+
self.accum_chunk_sizes = self.accum_chunk_sizes[1:]
|
265 |
+
|
266 |
+
if (
|
267 |
+
self._class_selector
|
268 |
+
and all([tsv.get_class_boundaries() for tsv in self.tsvs])
|
269 |
+
):
|
270 |
+
"""
|
271 |
+
Note: When using CompositeTSVFile, make sure that the classes contained in each
|
272 |
+
tsv file do not overlap. Otherwise, the class boundaries won't be correct.
|
273 |
+
"""
|
274 |
+
self._class_boundaries = []
|
275 |
+
offset = 0
|
276 |
+
for tsv in self.tsvs:
|
277 |
+
boundaries = tsv.get_class_boundaries()
|
278 |
+
for bound in boundaries:
|
279 |
+
self._class_boundaries.append((bound[0] + offset, bound[1] + offset))
|
280 |
+
offset += len(tsv)
|
281 |
+
self.initialized = True
|
282 |
+
|
283 |
+
|
284 |
+
def load_list_file(fname: str) -> List[str]:
|
285 |
+
with open(fname, 'r') as fp:
|
286 |
+
lines = fp.readlines()
|
287 |
+
result = [line.strip() for line in lines]
|
288 |
+
if len(result) > 0 and result[-1] == '':
|
289 |
+
result = result[:-1]
|
290 |
+
return result
|
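A minimal local sketch (an assumption for illustration, not part of the commit) of how TSVFile is driven: the .lineidx side file built by generate_lineidx stores one byte offset per row, which seek() then uses for random access.

# Sketch: write a tiny TSV, let TSVFile create its .lineidx, then read by index.
import os
import tempfile

tmpdir = tempfile.mkdtemp()
tsv_path = os.path.join(tmpdir, 'toy.tsv')
with open(tsv_path, 'w') as f:
    f.write('key_0\tlabel_a\n')
    f.write('key_1\tlabel_b\n')

tsv = TSVFile(tsv_path)   # writes toy.lineidx next to the tsv on first use
print(len(tsv))           # 2
print(tsv[1])             # ['key_1', 'label_b']
print(tsv.get_key(0))     # 'key_0'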
MedImageInsight/ImageDataLoader/zipdata.py
ADDED
@@ -0,0 +1,98 @@
1 |
+
import os.path as op
|
2 |
+
from zipfile import ZipFile, BadZipFile
|
3 |
+
import torch.utils.data as data
|
4 |
+
from PIL import Image
|
5 |
+
from io import BytesIO
|
6 |
+
import multiprocessing
|
7 |
+
|
8 |
+
_VALID_IMAGE_TYPES = ['.jpg', '.jpeg', '.tiff', '.bmp', '.png']
|
9 |
+
|
10 |
+
|
11 |
+
class ZipData(data.Dataset):
|
12 |
+
_IGNORE_ATTRS = {'_zip_file'}
|
13 |
+
|
14 |
+
def __init__(self, path, map_file,
|
15 |
+
transform=None, target_transform=None,
|
16 |
+
extensions=None):
|
17 |
+
self._path = path
|
18 |
+
if not extensions:
|
19 |
+
extensions = _VALID_IMAGE_TYPES
|
20 |
+
self._zip_file = ZipFile(path)
|
21 |
+
self.zip_dict = {}
|
22 |
+
self.samples = []
|
23 |
+
self.transform = transform
|
24 |
+
self.target_transform = target_transform
|
25 |
+
self.class_to_idx = {}
|
26 |
+
with open(map_file, 'r') as f:
|
27 |
+
for line in iter(f.readline, ""):
|
28 |
+
line = line.strip()
|
29 |
+
if not line:
|
30 |
+
continue
|
31 |
+
cls_idx = [l for l in line.split('\t') if l]
|
32 |
+
if not cls_idx:
|
33 |
+
continue
|
34 |
+
if (len(cls_idx) < 2):
|
35 |
+
cls_idx = [l for l in line.split(' ') if l]
|
36 |
+
if not cls_idx:
|
37 |
+
continue
|
38 |
+
assert len(cls_idx) >= 2, "invalid line: {}".format(line)
|
39 |
+
idx = int(cls_idx[1])
|
40 |
+
cls = cls_idx[0]
|
41 |
+
del cls_idx
|
42 |
+
at_idx = cls.find('@')
|
43 |
+
assert at_idx >= 0, "invalid class: {}".format(cls)
|
44 |
+
cls = cls[at_idx + 1:]
|
45 |
+
if cls.startswith('/'):
|
46 |
+
# Python ZipFile expects no root
|
47 |
+
cls = cls[1:]
|
48 |
+
assert cls, "invalid class in line {}".format(line)
|
49 |
+
prev_idx = self.class_to_idx.get(cls)
|
50 |
+
assert prev_idx is None or prev_idx == idx, "class: {} idx: {} previously had idx: {}".format(
|
51 |
+
cls, idx, prev_idx
|
52 |
+
)
|
53 |
+
self.class_to_idx[cls] = idx
|
54 |
+
|
55 |
+
for fst in self._zip_file.infolist():
|
56 |
+
fname = fst.filename
|
57 |
+
target = self.class_to_idx.get(fname)
|
58 |
+
if target is None:
|
59 |
+
continue
|
60 |
+
if fname.endswith('/') or fname.startswith('.') or fst.file_size == 0:
|
61 |
+
continue
|
62 |
+
ext = op.splitext(fname)[1].lower()
|
63 |
+
if ext in extensions:
|
64 |
+
self.samples.append((fname, target))
|
65 |
+
assert len(self), "No images found in: {} with map: {}".format(self._path, map_file)
|
66 |
+
|
67 |
+
def __repr__(self):
|
68 |
+
return 'ZipData({}, size={})'.format(self._path, len(self))
|
69 |
+
|
70 |
+
def __getstate__(self):
|
71 |
+
return {
|
72 |
+
key: val if key not in self._IGNORE_ATTRS else None
|
73 |
+
for key, val in self.__dict__.items()  # .iteritems() is Python 2-only; .items() works on Python 3
|
74 |
+
}
|
75 |
+
|
76 |
+
def __getitem__(self, index):
|
77 |
+
proc = multiprocessing.current_process()
|
78 |
+
pid = proc.pid # get pid of this process.
|
79 |
+
if pid not in self.zip_dict:
|
80 |
+
self.zip_dict[pid] = ZipFile(self._path)
|
81 |
+
zip_file = self.zip_dict[pid]
|
82 |
+
|
83 |
+
if index >= len(self) or index < 0:
|
84 |
+
raise KeyError("{} is invalid".format(index))
|
85 |
+
path, target = self.samples[index]
|
86 |
+
try:
|
87 |
+
sample = Image.open(BytesIO(zip_file.read(path))).convert('RGB')
|
88 |
+
except BadZipFile:
|
89 |
+
print("bad zip file")
|
90 |
+
return None, None
|
91 |
+
if self.transform is not None:
|
92 |
+
sample = self.transform(sample)
|
93 |
+
if self.target_transform is not None:
|
94 |
+
target = self.target_transform(target)
|
95 |
+
return sample, target
|
96 |
+
|
97 |
+
def __len__(self):
|
98 |
+
return len(self.samples)
|
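The map_file parsed in ZipData.__init__ is tab-separated (falling back to space-separated). A sketch of what rows could look like, as an illustrative assumption rather than files from this commit:

# Sketch of map_file rows for ZipData: "<prefix>@<path inside the zip>\t<class index>".
# The text before '@' is dropped and a leading '/' is stripped, so the remainder must
# match a filename stored in the zip archive.
map_rows = [
    'train.zip@/train/class_a/img_0001.jpg\t0',
    'train.zip@/train/class_b/img_0002.jpg\t1',
]
with open('toy_map.txt', 'w') as f:
    f.write('\n'.join(map_rows) + '\n')

# dataset = ZipData('train.zip', 'toy_map.txt')  # assuming a matching zip archive exists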
MedImageInsight/ImageEncoder/__init__.py
ADDED
@@ -0,0 +1,8 @@
1 |
+
from __future__ import absolute_import
|
2 |
+
from __future__ import division
|
3 |
+
from __future__ import print_function
|
4 |
+
|
5 |
+
from .build import build_image_encoder
|
6 |
+
|
7 |
+
from .coswin import *
|
8 |
+
from .davit_v1 import *
|
MedImageInsight/ImageEncoder/build.py
ADDED
@@ -0,0 +1,13 @@
1 |
+
from .registry import image_encoders
|
2 |
+
from .registry import is_image_encoder
|
3 |
+
|
4 |
+
|
5 |
+
def build_image_encoder(config_encoder, verbose, **kwargs):
|
6 |
+
model_name = config_encoder['NAME']
|
7 |
+
if model_name.startswith('cls_'):
|
8 |
+
model_name = model_name[4:]
|
9 |
+
|
10 |
+
if not is_image_encoder(model_name):
|
11 |
+
raise ValueError(f'Unknown model: {model_name}')
|
12 |
+
|
13 |
+
return image_encoders(model_name)(config_encoder, verbose, **kwargs)
|
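build_image_encoder leans on the registry module, which is not included in this part of the diff; from the call sites here and the @register_image_encoder decorator used in coswin.py below, its contract is presumably a simple name-to-builder mapping. A plausible sketch under that assumption follows; it is not the actual registry.py in this commit.

# Assumed registry contract (sketch only): register builder functions by name
# and look them up by name. The real registry.py may differ.
_image_encoders = {}

def register_image_encoder(fn):
    _image_encoders[fn.__name__] = fn
    return fn

def is_image_encoder(model_name):
    return model_name in _image_encoders

def image_encoders(model_name):
    return _image_encoders[model_name]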
MedImageInsight/ImageEncoder/coswin.py
ADDED
@@ -0,0 +1,779 @@
1 |
+
# --------------------------------------------------------
|
2 |
+
# CoSwin: Convolutional Swin Transformer
|
3 |
+
# Copyright (c) 2021 Microsoft
|
4 |
+
# Licensed under The MIT License [see LICENSE for details]
|
5 |
+
# Written by Ze Liu
|
6 |
+
# Modified by Bin Xiao
|
7 |
+
# --------------------------------------------------------
|
8 |
+
|
9 |
+
import logging
|
10 |
+
import os
|
11 |
+
import torch
|
12 |
+
import torch.nn as nn
|
13 |
+
import torch.utils.checkpoint as checkpoint
|
14 |
+
import numpy as np
|
15 |
+
from einops import rearrange, repeat
|
16 |
+
from einops.layers.torch import Rearrange
|
17 |
+
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
18 |
+
|
19 |
+
from .registry import register_image_encoder
|
20 |
+
|
21 |
+
logger = logging.getLogger(__name__)
|
22 |
+
|
23 |
+
|
24 |
+
|
25 |
+
class Mlp(nn.Module):
|
26 |
+
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
|
27 |
+
super().__init__()
|
28 |
+
out_features = out_features or in_features
|
29 |
+
hidden_features = hidden_features or in_features
|
30 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
31 |
+
self.act = act_layer()
|
32 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
33 |
+
self.drop = nn.Dropout(drop)
|
34 |
+
|
35 |
+
def forward(self, x):
|
36 |
+
x = self.fc1(x)
|
37 |
+
x = self.act(x)
|
38 |
+
x = self.drop(x)
|
39 |
+
x = self.fc2(x)
|
40 |
+
x = self.drop(x)
|
41 |
+
return x
|
42 |
+
|
43 |
+
|
44 |
+
def window_partition(x, window_size):
|
45 |
+
"""
|
46 |
+
Args:
|
47 |
+
x: (B, H, W, C)
|
48 |
+
window_size (int): window size
|
49 |
+
|
50 |
+
Returns:
|
51 |
+
windows: (num_windows*B, window_size, window_size, C)
|
52 |
+
"""
|
53 |
+
B, H, W, C = x.shape
|
54 |
+
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
|
55 |
+
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
|
56 |
+
return windows
|
57 |
+
|
58 |
+
|
59 |
+
def window_reverse(windows, window_size, H, W):
|
60 |
+
"""
|
61 |
+
Args:
|
62 |
+
windows: (num_windows*B, window_size, window_size, C)
|
63 |
+
window_size (int): Window size
|
64 |
+
H (int): Height of image
|
65 |
+
W (int): Width of image
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
x: (B, H, W, C)
|
69 |
+
"""
|
70 |
+
B = int(windows.shape[0] / (H * W / window_size / window_size))
|
71 |
+
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
|
72 |
+
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
|
73 |
+
return x
|
74 |
+
|
75 |
+
|
76 |
+
class WindowAttention(nn.Module):
|
77 |
+
r""" Window based multi-head self attention (W-MSA) module with relative position bias.
|
78 |
+
It supports both of shifted and non-shifted window.
|
79 |
+
|
80 |
+
Args:
|
81 |
+
dim (int): Number of input channels.
|
82 |
+
window_size (tuple[int]): The height and width of the window.
|
83 |
+
num_heads (int): Number of attention heads.
|
84 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
85 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
|
86 |
+
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
|
87 |
+
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
|
88 |
+
"""
|
89 |
+
|
90 |
+
def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
|
91 |
+
|
92 |
+
super().__init__()
|
93 |
+
self.dim = dim
|
94 |
+
self.window_size = window_size # Wh, Ww
|
95 |
+
self.num_heads = num_heads
|
96 |
+
head_dim = dim // num_heads
|
97 |
+
self.scale = qk_scale or head_dim ** -0.5
|
98 |
+
|
99 |
+
# define a parameter table of relative position bias
|
100 |
+
self.relative_position_bias_table = nn.Parameter(
|
101 |
+
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
|
102 |
+
|
103 |
+
# get pair-wise relative position index for each token inside the window
|
104 |
+
coords_h = torch.arange(self.window_size[0])
|
105 |
+
coords_w = torch.arange(self.window_size[1])
|
106 |
+
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
|
107 |
+
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
|
108 |
+
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
|
109 |
+
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
|
110 |
+
relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
|
111 |
+
relative_coords[:, :, 1] += self.window_size[1] - 1
|
112 |
+
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
|
113 |
+
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
|
114 |
+
self.register_buffer("relative_position_index", relative_position_index)
|
115 |
+
|
116 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
117 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
118 |
+
self.proj = nn.Linear(dim, dim)
|
119 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
120 |
+
|
121 |
+
trunc_normal_(self.relative_position_bias_table, std=.02)
|
122 |
+
self.softmax = nn.Softmax(dim=-1)
|
123 |
+
|
124 |
+
def forward(self, x, mask=None):
|
125 |
+
"""
|
126 |
+
Args:
|
127 |
+
x: input features with shape of (num_windows*B, N, C)
|
128 |
+
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
|
129 |
+
"""
|
130 |
+
B_, N, C = x.shape
|
131 |
+
qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
132 |
+
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
|
133 |
+
|
134 |
+
q = q * self.scale
|
135 |
+
attn = (q @ k.transpose(-2, -1))
|
136 |
+
|
137 |
+
relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
|
138 |
+
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
|
139 |
+
relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
|
140 |
+
attn = attn + relative_position_bias.unsqueeze(0)
|
141 |
+
|
142 |
+
if mask is not None:
|
143 |
+
nW = mask.shape[0]
|
144 |
+
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
|
145 |
+
attn = attn.view(-1, self.num_heads, N, N)
|
146 |
+
attn = self.softmax(attn)
|
147 |
+
else:
|
148 |
+
attn = self.softmax(attn)
|
149 |
+
|
150 |
+
attn = self.attn_drop(attn)
|
151 |
+
|
152 |
+
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
|
153 |
+
x = self.proj(x)
|
154 |
+
x = self.proj_drop(x)
|
155 |
+
return x
|
156 |
+
|
157 |
+
def extra_repr(self) -> str:
|
158 |
+
return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
|
159 |
+
|
160 |
+
def flops(self, N):
|
161 |
+
# calculate flops for 1 window with token length of N
|
162 |
+
flops = 0
|
163 |
+
# qkv = self.qkv(x)
|
164 |
+
flops += N * self.dim * 3 * self.dim
|
165 |
+
# attn = (q @ k.transpose(-2, -1))
|
166 |
+
flops += self.num_heads * N * (self.dim // self.num_heads) * N
|
167 |
+
# x = (attn @ v)
|
168 |
+
flops += self.num_heads * N * N * (self.dim // self.num_heads)
|
169 |
+
# x = self.proj(x)
|
170 |
+
flops += N * self.dim * self.dim
|
171 |
+
return flops
|
172 |
+
|
173 |
+
|
174 |
+
class SwinTransformerBlock(nn.Module):
|
175 |
+
r""" Swin Transformer Block.
|
176 |
+
|
177 |
+
Args:
|
178 |
+
dim (int): Number of input channels.
|
179 |
+
input_resolution (tuple[int]): Input resolution.
|
180 |
+
num_heads (int): Number of attention heads.
|
181 |
+
window_size (int): Window size.
|
182 |
+
shift_size (int): Shift size for SW-MSA.
|
183 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
184 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
185 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
186 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
187 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
188 |
+
drop_path (float, optional): Stochastic depth rate. Default: 0.0
|
189 |
+
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
|
190 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
191 |
+
"""
|
192 |
+
|
193 |
+
def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
|
194 |
+
mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
|
195 |
+
act_layer=nn.GELU, norm_layer=nn.LayerNorm, layer_scale=False):
|
196 |
+
super().__init__()
|
197 |
+
self.dim = dim
|
198 |
+
self.input_resolution = input_resolution
|
199 |
+
self.num_heads = num_heads
|
200 |
+
self.window_size = window_size
|
201 |
+
self.shift_size = shift_size
|
202 |
+
self.mlp_ratio = mlp_ratio
|
203 |
+
if min(self.input_resolution) <= self.window_size:
|
204 |
+
# if window size is larger than input resolution, we don't partition windows
|
205 |
+
self.shift_size = 0
|
206 |
+
self.window_size = min(self.input_resolution)
|
207 |
+
assert 0 <= self.shift_size < self.window_size, "shift_size must be in 0-window_size"
|
208 |
+
|
209 |
+
self.norm1 = norm_layer(dim)
|
210 |
+
self.attn = WindowAttention(
|
211 |
+
dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
|
212 |
+
qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
|
213 |
+
|
214 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
215 |
+
self.norm2 = norm_layer(dim)
|
216 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
217 |
+
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
|
218 |
+
|
219 |
+
if self.shift_size > 0:
|
220 |
+
# calculate attention mask for SW-MSA
|
221 |
+
H, W = self.input_resolution
|
222 |
+
img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
|
223 |
+
h_slices = (slice(0, -self.window_size),
|
224 |
+
slice(-self.window_size, -self.shift_size),
|
225 |
+
slice(-self.shift_size, None))
|
226 |
+
w_slices = (slice(0, -self.window_size),
|
227 |
+
slice(-self.window_size, -self.shift_size),
|
228 |
+
slice(-self.shift_size, None))
|
229 |
+
cnt = 0
|
230 |
+
for h in h_slices:
|
231 |
+
for w in w_slices:
|
232 |
+
img_mask[:, h, w, :] = cnt
|
233 |
+
cnt += 1
|
234 |
+
|
235 |
+
mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
|
236 |
+
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
|
237 |
+
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
238 |
+
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
|
239 |
+
else:
|
240 |
+
attn_mask = None
|
241 |
+
|
242 |
+
self.gamma = 1.0
|
243 |
+
if layer_scale:
|
244 |
+
logger.info('=> enable layer scale')
|
245 |
+
self.gamma = nn.Parameter(
|
246 |
+
1e-4*torch.ones(dim), requires_grad=True
|
247 |
+
)
|
248 |
+
|
249 |
+
self.register_buffer("attn_mask", attn_mask)
|
250 |
+
|
251 |
+
def forward(self, x):
|
252 |
+
H, W = self.input_resolution
|
253 |
+
B, L, C = x.shape
|
254 |
+
assert L == H * W, "input feature has wrong size"
|
255 |
+
|
256 |
+
shortcut = x
|
257 |
+
x = self.norm1(x)
|
258 |
+
x = x.view(B, H, W, C)
|
259 |
+
|
260 |
+
# cyclic shift
|
261 |
+
if self.shift_size > 0:
|
262 |
+
shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
|
263 |
+
else:
|
264 |
+
shifted_x = x
|
265 |
+
|
266 |
+
# partition windows
|
267 |
+
x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
|
268 |
+
x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
|
269 |
+
|
270 |
+
# W-MSA/SW-MSA
|
271 |
+
attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C
|
272 |
+
|
273 |
+
# merge windows
|
274 |
+
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
|
275 |
+
shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
|
276 |
+
|
277 |
+
# reverse cyclic shift
|
278 |
+
if self.shift_size > 0:
|
279 |
+
x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
|
280 |
+
else:
|
281 |
+
x = shifted_x
|
282 |
+
x = x.view(B, H * W, C)
|
283 |
+
|
284 |
+
# FFN
|
285 |
+
x = shortcut + self.drop_path(self.gamma*x)
|
286 |
+
x = x + self.drop_path(self.gamma*self.mlp(self.norm2(x)))
|
287 |
+
|
288 |
+
return x
|
289 |
+
|
290 |
+
def extra_repr(self) -> str:
|
291 |
+
return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
|
292 |
+
f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
|
293 |
+
|
294 |
+
def flops(self):
|
295 |
+
flops = 0
|
296 |
+
H, W = self.input_resolution
|
297 |
+
# norm1
|
298 |
+
flops += self.dim * H * W
|
299 |
+
# W-MSA/SW-MSA
|
300 |
+
nW = H * W / self.window_size / self.window_size
|
301 |
+
flops += nW * self.attn.flops(self.window_size * self.window_size)
|
302 |
+
# mlp
|
303 |
+
flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
|
304 |
+
# norm2
|
305 |
+
flops += self.dim * H * W
|
306 |
+
return flops
|
307 |
+
|
308 |
+
|
309 |
+
class PatchMerging(nn.Module):
|
310 |
+
r""" Patch Merging Layer.
|
311 |
+
|
312 |
+
Args:
|
313 |
+
input_resolution (tuple[int]): Resolution of input feature.
|
314 |
+
dim (int): Number of input channels.
|
315 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
316 |
+
"""
|
317 |
+
|
318 |
+
def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
|
319 |
+
super().__init__()
|
320 |
+
self.input_resolution = input_resolution
|
321 |
+
self.dim = dim
|
322 |
+
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
|
323 |
+
self.norm = norm_layer(4 * dim)
|
324 |
+
|
325 |
+
def forward(self, x):
|
326 |
+
"""
|
327 |
+
x: B, H*W, C
|
328 |
+
"""
|
329 |
+
H, W = self.input_resolution
|
330 |
+
B, L, C = x.shape
|
331 |
+
assert L == H * W, "input feature has wrong size"
|
332 |
+
assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even."
|
333 |
+
|
334 |
+
x = x.view(B, H, W, C)
|
335 |
+
|
336 |
+
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
|
337 |
+
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
|
338 |
+
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
|
339 |
+
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
|
340 |
+
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
|
341 |
+
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
|
342 |
+
|
343 |
+
x = self.norm(x)
|
344 |
+
x = self.reduction(x)
|
345 |
+
|
346 |
+
return x
|
347 |
+
|
348 |
+
def extra_repr(self) -> str:
|
349 |
+
return f"input_resolution={self.input_resolution}, dim={self.dim}"
|
350 |
+
|
351 |
+
def flops(self):
|
352 |
+
H, W = self.input_resolution
|
353 |
+
flops = H * W * self.dim
|
354 |
+
flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
|
355 |
+
return flops
|
356 |
+
|
357 |
+
|
358 |
+
class BasicLayer(nn.Module):
|
359 |
+
""" A basic Swin Transformer layer for one stage.
|
360 |
+
|
361 |
+
Args:
|
362 |
+
dim (int): Number of input channels.
|
363 |
+
input_resolution (tuple[int]): Input resolution.
|
364 |
+
depth (int): Number of blocks.
|
365 |
+
num_heads (int): Number of attention heads.
|
366 |
+
window_size (int): Local window size.
|
367 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
368 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
369 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
370 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
371 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
372 |
+
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
|
373 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
374 |
+
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
|
375 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
376 |
+
"""
|
377 |
+
|
378 |
+
def __init__(self, dim, input_resolution, depth, num_heads, window_size,
|
379 |
+
mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
|
380 |
+
drop_path=0., norm_layer=nn.LayerNorm, downsample=None,
|
381 |
+
use_checkpoint=False, layer_scale=False):
|
382 |
+
|
383 |
+
super().__init__()
|
384 |
+
self.dim = dim
|
385 |
+
self.input_resolution = input_resolution
|
386 |
+
self.depth = depth
|
387 |
+
self.use_checkpoint = use_checkpoint
|
388 |
+
|
389 |
+
# build blocks
|
390 |
+
self.blocks = nn.ModuleList([
|
391 |
+
SwinTransformerBlock(
|
392 |
+
dim=dim, input_resolution=input_resolution,
|
393 |
+
num_heads=num_heads, window_size=window_size,
|
394 |
+
shift_size=0 if (i % 2 == 0) else window_size // 2,
|
395 |
+
mlp_ratio=mlp_ratio,
|
396 |
+
qkv_bias=qkv_bias, qk_scale=qk_scale,
|
397 |
+
drop=drop, attn_drop=attn_drop,
|
398 |
+
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
|
399 |
+
norm_layer=norm_layer,
|
400 |
+
layer_scale=layer_scale
|
401 |
+
)
|
402 |
+
for i in range(depth)])
|
403 |
+
|
404 |
+
# patch merging layer
|
405 |
+
if downsample is not None:
|
406 |
+
# self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
|
407 |
+
self.downsample = downsample(
|
408 |
+
input_resolution=input_resolution, patch_size=3, in_chans=dim, embed_dim=dim*2,
|
409 |
+
stride=2, padding=1, norm_layer=norm_layer
|
410 |
+
)
|
411 |
+
else:
|
412 |
+
self.downsample = None
|
413 |
+
|
414 |
+
def forward(self, x):
|
415 |
+
for blk in self.blocks:
|
416 |
+
if self.use_checkpoint:
|
417 |
+
x = checkpoint.checkpoint(blk, x)
|
418 |
+
else:
|
419 |
+
x = blk(x)
|
420 |
+
if self.downsample is not None:
|
421 |
+
x = self.downsample(x)
|
422 |
+
return x
|
423 |
+
|
424 |
+
def extra_repr(self) -> str:
|
425 |
+
return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
|
426 |
+
|
427 |
+
def flops(self):
|
428 |
+
flops = 0
|
429 |
+
for blk in self.blocks:
|
430 |
+
flops += blk.flops()
|
431 |
+
if self.downsample is not None:
|
432 |
+
flops += self.downsample.flops()
|
433 |
+
return flops
|
434 |
+
|
435 |
+
|
436 |
+
class PatchEmbed(nn.Module):
|
437 |
+
r""" Image to Patch Embedding
|
438 |
+
|
439 |
+
Args:
|
440 |
+
img_size (int): Image size. Default: 224.
|
441 |
+
patch_size (int): Patch token size. Default: 4.
|
442 |
+
in_chans (int): Number of input image channels. Default: 3.
|
443 |
+
embed_dim (int): Number of linear projection output channels. Default: 96.
|
444 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: None
|
445 |
+
"""
|
446 |
+
|
447 |
+
def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
|
448 |
+
super().__init__()
|
449 |
+
img_size = to_2tuple(img_size)
|
450 |
+
patch_size = to_2tuple(patch_size)
|
451 |
+
patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
|
452 |
+
self.img_size = img_size
|
453 |
+
self.patch_size = patch_size
|
454 |
+
self.patches_resolution = patches_resolution
|
455 |
+
self.num_patches = patches_resolution[0] * patches_resolution[1]
|
456 |
+
|
457 |
+
self.in_chans = in_chans
|
458 |
+
self.embed_dim = embed_dim
|
459 |
+
|
460 |
+
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
|
461 |
+
if norm_layer is not None:
|
462 |
+
self.norm = norm_layer(embed_dim)
|
463 |
+
else:
|
464 |
+
self.norm = None
|
465 |
+
|
466 |
+
def forward(self, x):
|
467 |
+
B, C, H, W = x.shape
|
468 |
+
# FIXME look at relaxing size constraints
|
469 |
+
assert H == self.img_size[0] and W == self.img_size[1], \
|
470 |
+
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
|
471 |
+
x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C
|
472 |
+
if self.norm is not None:
|
473 |
+
x = self.norm(x)
|
474 |
+
return x
|
475 |
+
|
476 |
+
def flops(self):
|
477 |
+
Ho, Wo = self.patches_resolution
|
478 |
+
flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
|
479 |
+
if self.norm is not None:
|
480 |
+
flops += Ho * Wo * self.embed_dim
|
481 |
+
return flops
|
482 |
+
|
483 |
+
|
484 |
+
class ConvEmbed(nn.Module):
|
485 |
+
""" Image to Patch Embedding
|
486 |
+
"""
|
487 |
+
|
488 |
+
def __init__(
|
489 |
+
self,
|
490 |
+
input_resolution=(224,224),
|
491 |
+
patch_size=7,
|
492 |
+
in_chans=3,
|
493 |
+
embed_dim=64,
|
494 |
+
stride=4,
|
495 |
+
padding=2,
|
496 |
+
norm_layer=None
|
497 |
+
):
|
498 |
+
super().__init__()
|
499 |
+
self.patch_size = patch_size
|
500 |
+
self.input_resolution = input_resolution
|
501 |
+
|
502 |
+
self.proj = nn.Conv2d(
|
503 |
+
in_chans, embed_dim,
|
504 |
+
kernel_size=patch_size,
|
505 |
+
stride=stride,
|
506 |
+
padding=padding
|
507 |
+
)
|
508 |
+
self.norm = norm_layer(embed_dim) if norm_layer else None
|
509 |
+
|
510 |
+
def forward(self, x):
|
511 |
+
if len(x.size()) == 3:
|
512 |
+
x = rearrange(
|
513 |
+
x, 'b (h w) c -> b c h w',
|
514 |
+
h=self.input_resolution[0],
|
515 |
+
w=self.input_resolution[1]
|
516 |
+
)
|
517 |
+
|
518 |
+
x = self.proj(x)
|
519 |
+
|
520 |
+
B, C, H, W = x.shape
|
521 |
+
x = rearrange(x, 'b c h w -> b (h w) c')
|
522 |
+
if self.norm:
|
523 |
+
x = self.norm(x)
|
524 |
+
|
525 |
+
return x
|
526 |
+
|
527 |
+
|
528 |
+
class SwinTransformer(nn.Module):
|
529 |
+
r""" Swin Transformer
|
530 |
+
A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
|
531 |
+
https://arxiv.org/pdf/2103.14030
|
532 |
+
|
533 |
+
Args:
|
534 |
+
img_size (int | tuple(int)): Input image size. Default 224
|
535 |
+
patch_size (int | tuple(int)): Patch size. Default: 4
|
536 |
+
in_chans (int): Number of input image channels. Default: 3
|
537 |
+
num_classes (int): Number of classes for classification head. Default: 1000
|
538 |
+
embed_dim (int): Patch embedding dimension. Default: 96
|
539 |
+
depths (tuple(int)): Depth of each Swin Transformer layer.
|
540 |
+
num_heads (tuple(int)): Number of attention heads in different layers.
|
541 |
+
window_size (int): Window size. Default: 7
|
542 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
|
543 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
|
544 |
+
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
|
545 |
+
drop_rate (float): Dropout rate. Default: 0
|
546 |
+
attn_drop_rate (float): Attention dropout rate. Default: 0
|
547 |
+
drop_path_rate (float): Stochastic depth rate. Default: 0.1
|
548 |
+
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
|
549 |
+
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
|
550 |
+
patch_norm (bool): If True, add normalization after patch embedding. Default: True
|
551 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
|
552 |
+
"""
|
553 |
+
|
554 |
+
def __init__(self, img_size=224, patch_size=7, patch_padding=2, patch_stride=4, in_chans=3,
|
555 |
+
num_classes=1000, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
|
556 |
+
window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
|
557 |
+
drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
|
558 |
+
norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
|
559 |
+
use_checkpoint=False, layer_scale=False, **kwargs):
|
560 |
+
super().__init__()
|
561 |
+
|
562 |
+
self.num_classes = num_classes
|
563 |
+
self.num_layers = len(depths)
|
564 |
+
self.embed_dim = embed_dim
|
565 |
+
self.ape = ape
|
566 |
+
self.patch_norm = patch_norm
|
567 |
+
self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
|
568 |
+
self.mlp_ratio = mlp_ratio
|
569 |
+
|
570 |
+
# split image into non-overlapping patches
|
571 |
+
# self.patch_embed = PatchEmbed(
|
572 |
+
# img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
|
573 |
+
# norm_layer=norm_layer if self.patch_norm else None)
|
574 |
+
|
575 |
+
self.patch_embed = ConvEmbed(
|
576 |
+
patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, padding=patch_padding,
|
577 |
+
norm_layer=norm_layer if self.patch_norm else None
|
578 |
+
)
|
579 |
+
|
580 |
+
img_size = to_2tuple(img_size)
|
581 |
+
patches_resolution = (
|
582 |
+
int(np.floor(float(img_size[0]+2*patch_padding-patch_size)/patch_stride+1)),
|
583 |
+
int(np.floor(float(img_size[1]+2*patch_padding-patch_size)/patch_stride+1))
|
584 |
+
)
|
585 |
+
num_patches = patches_resolution[0] * patches_resolution[1]
|
586 |
+
# num_patches = self.patch_embed.num_patches
|
587 |
+
# patches_resolution = self.patch_embed.patches_resolution
|
588 |
+
self.patches_resolution = patches_resolution
|
589 |
+
|
590 |
+
# absolute position embedding
|
591 |
+
if self.ape:
|
592 |
+
self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
|
593 |
+
trunc_normal_(self.absolute_pos_embed, std=.02)
|
594 |
+
|
595 |
+
self.pos_drop = nn.Dropout(p=drop_rate)
|
596 |
+
|
597 |
+
# stochastic depth
|
598 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
|
599 |
+
|
600 |
+
# build layers
|
601 |
+
self.layers = nn.ModuleList()
|
602 |
+
for i_layer in range(self.num_layers):
|
603 |
+
layer = BasicLayer(
|
604 |
+
dim=int(embed_dim * 2 ** i_layer),
|
605 |
+
input_resolution=(
|
606 |
+
patches_resolution[0] // (2 ** i_layer),
|
607 |
+
patches_resolution[1] // (2 ** i_layer)
|
608 |
+
),
|
609 |
+
depth=depths[i_layer],
|
610 |
+
num_heads=num_heads[i_layer],
|
611 |
+
window_size=window_size,
|
612 |
+
mlp_ratio=self.mlp_ratio,
|
613 |
+
qkv_bias=qkv_bias, qk_scale=qk_scale,
|
614 |
+
drop=drop_rate, attn_drop=attn_drop_rate,
|
615 |
+
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
|
616 |
+
norm_layer=norm_layer,
|
617 |
+
# downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
|
618 |
+
downsample=ConvEmbed if (i_layer < self.num_layers - 1) else None,
|
619 |
+
use_checkpoint=use_checkpoint,
|
620 |
+
layer_scale=layer_scale
|
621 |
+
)
|
622 |
+
self.layers.append(layer)
|
623 |
+
|
624 |
+
self.norm = norm_layer(self.num_features)
|
625 |
+
self.avgpool = nn.AdaptiveAvgPool1d(1)
|
626 |
+
self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
|
627 |
+
|
628 |
+
self.apply(self._init_weights)
|
629 |
+
|
630 |
+
@property
|
631 |
+
def dim_out(self):
|
632 |
+
return self.num_features
|
633 |
+
|
634 |
+
def _init_weights(self, m):
|
635 |
+
if isinstance(m, nn.Linear):
|
636 |
+
trunc_normal_(m.weight, std=.02)
|
637 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
638 |
+
nn.init.constant_(m.bias, 0)
|
639 |
+
elif isinstance(m, nn.LayerNorm):
|
640 |
+
nn.init.constant_(m.bias, 0)
|
641 |
+
nn.init.constant_(m.weight, 1.0)
|
642 |
+
|
643 |
+
def from_pretrained(self, pretrained='', pretrained_layers=[], verbose=True):
|
644 |
+
if os.path.isfile(pretrained):
|
645 |
+
logger.info(f'=> loading pretrained model {pretrained}')
|
646 |
+
pretrained_dict = torch.load(pretrained, map_location='cpu')
|
647 |
+
|
648 |
+
self.from_state_dict(pretrained_dict, pretrained_layers, verbose)
|
649 |
+
|
650 |
+
def from_state_dict(self, pretrained_dict, pretrained_layers=[], verbose=True):
|
651 |
+
model_dict = self.state_dict()
|
652 |
+
stripped_key = lambda x: x[14:] if x.startswith('image_encoder.') else x
|
653 |
+
|
654 |
+
pretrained_dict = {
|
655 |
+
stripped_key(k): v for k, v in pretrained_dict.items()
|
656 |
+
if stripped_key(k) in model_dict.keys()
|
657 |
+
}
|
658 |
+
need_init_state_dict = {}
|
659 |
+
for k, v in pretrained_dict.items():
|
660 |
+
need_init = (
|
661 |
+
(
|
662 |
+
k.split('.')[0] in pretrained_layers
|
663 |
+
or pretrained_layers[0] == '*'
|
664 |
+
)
|
665 |
+
and 'relative_position_index' not in k
|
666 |
+
and 'attn_mask' not in k
|
667 |
+
)
|
668 |
+
|
669 |
+
if need_init:
|
670 |
+
if verbose:
|
671 |
+
logger.info(f'=> init {k} from pretrained state dict')
|
672 |
+
|
673 |
+
if 'relative_position_bias_table' in k and v.size() != model_dict[k].size():
|
674 |
+
relative_position_bias_table_pretrained = v
|
675 |
+
relative_position_bias_table_current = model_dict[k]
|
676 |
+
L1, nH1 = relative_position_bias_table_pretrained.size()
|
677 |
+
L2, nH2 = relative_position_bias_table_current.size()
|
678 |
+
if nH1 != nH2:
|
679 |
+
logger.info(f"Error in loading {k}, passing")
|
680 |
+
else:
|
681 |
+
if L1 != L2:
|
682 |
+
logger.info(
|
683 |
+
'=> load_pretrained: resized variant: {} to {}'
|
684 |
+
.format((L1, nH1), (L2, nH2))
|
685 |
+
)
|
686 |
+
S1 = int(L1 ** 0.5)
|
687 |
+
S2 = int(L2 ** 0.5)
|
688 |
+
relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate(
|
689 |
+
relative_position_bias_table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
|
690 |
+
size=(S2, S2),
|
691 |
+
mode='bicubic')
|
692 |
+
v = relative_position_bias_table_pretrained_resized.view(nH2, L2).permute(1, 0)
|
693 |
+
|
694 |
+
if 'absolute_pos_embed' in k and v.size() != model_dict[k].size():
|
695 |
+
absolute_pos_embed_pretrained = v
|
696 |
+
absolute_pos_embed_current = model_dict[k]
|
697 |
+
_, L1, C1 = absolute_pos_embed_pretrained.size()
|
698 |
+
_, L2, C2 = absolute_pos_embed_current.size()
|
699 |
+
if C1 != C2:
|
700 |
+
logger.info(f"Error in loading {k}, passing")
|
701 |
+
else:
|
702 |
+
if L1 != L2:
|
703 |
+
logger.info(
|
704 |
+
'=> load_pretrained: resized variant: {} to {}'
|
705 |
+
.format((1, L1, C1), (1, L2, C2))
|
706 |
+
)
|
707 |
+
S1 = int(L1 ** 0.5)
|
708 |
+
S2 = int(L2 ** 0.5)
|
709 |
+
absolute_pos_embed_pretrained = absolute_pos_embed_pretrained.reshape(-1, S1, S1, C1)
|
710 |
+
absolute_pos_embed_pretrained = absolute_pos_embed_pretrained.permute(0, 3, 1, 2)
|
711 |
+
absolute_pos_embed_pretrained_resized = torch.nn.functional.interpolate(
|
712 |
+
absolute_pos_embed_pretrained, size=(S2, S2), mode='bicubic')
|
713 |
+
v = absolute_pos_embed_pretrained_resized.permute(0, 2, 3, 1).flatten(1, 2)
|
714 |
+
|
715 |
+
need_init_state_dict[k] = v
|
716 |
+
self.load_state_dict(need_init_state_dict, strict=False)
|
717 |
+
|
718 |
+
@torch.jit.ignore
|
719 |
+
def no_weight_decay(self):
|
720 |
+
return {'absolute_pos_embed'}
|
721 |
+
|
722 |
+
@torch.jit.ignore
|
723 |
+
def no_weight_decay_keywords(self):
|
724 |
+
return {'relative_position_bias_table'}
|
725 |
+
|
726 |
+
def forward_features(self, x):
|
727 |
+
x = self.patch_embed(x)
|
728 |
+
if self.ape:
|
729 |
+
x = x + self.absolute_pos_embed
|
730 |
+
x = self.pos_drop(x)
|
731 |
+
|
732 |
+
for layer in self.layers:
|
733 |
+
x = layer(x)
|
734 |
+
|
735 |
+
x = self.norm(x) # B L C
|
736 |
+
x = self.avgpool(x.transpose(1, 2)) # B C 1
|
737 |
+
x = torch.flatten(x, 1)
|
738 |
+
return x
|
739 |
+
|
740 |
+
def forward(self, x):
|
741 |
+
x = self.forward_features(x)
|
742 |
+
x = self.head(x)
|
743 |
+
return x
|
744 |
+
|
745 |
+
|
746 |
+
@register_image_encoder
|
747 |
+
def image_encoder(config_encoder, verbose, **kwargs):
|
748 |
+
spec = config_encoder['SPEC']
|
749 |
+
|
750 |
+
coswin = SwinTransformer(
|
751 |
+
img_size=config_encoder['IMAGE_SIZE'],
|
752 |
+
patch_size=spec['PATCH_SIZE'],
|
753 |
+
patch_padding=spec['PATCH_PADDING'],
|
754 |
+
patch_stride=spec['PATCH_STRIDE'],
|
755 |
+
in_chans=spec['IN_CHANS'],
|
756 |
+
num_classes=0,
|
757 |
+
embed_dim=spec['EMBED_DIM'],
|
758 |
+
depths=spec['DEPTHS'],
|
759 |
+
num_heads=spec['NUM_HEADS'],
|
760 |
+
window_size=spec['WINDOW_SIZE'],
|
761 |
+
mlp_ratio=spec['MLP_RATIO'],
|
762 |
+
qkv_bias=spec['QKV_BIAS'],
|
763 |
+
qk_scale=spec.get('QK_SCALE', None),
|
764 |
+
drop_rate=spec['DROP_RATE'],
|
765 |
+
drop_path_rate=spec['DROP_PATH_RATE'],
|
766 |
+
ape=spec['APE'],
|
767 |
+
patch_norm=spec['PATCH_NORM'],
|
768 |
+
layer_scale=spec.get('LAYER_SCALE', False),
|
769 |
+
use_checkpoint=spec.get('ENABLE_CHECKPOINT', False)
|
770 |
+
)
|
771 |
+
|
772 |
+
if config_encoder['LOAD_PRETRAINED']:
|
773 |
+
coswin.from_pretrained(
|
774 |
+
config_encoder['PRETRAINED'],
|
775 |
+
config_encoder['PRETRAINED_LAYERS'],
|
776 |
+
verbose
|
777 |
+
)
|
778 |
+
|
779 |
+
return coswin
|
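As a hedged usage sketch (not part of the commit), the coswin SwinTransformer above can be exercised end to end; the import path and the concrete hyperparameter values below are assumptions for illustration, not the project's configuration.

import torch
from MedImageInsight.ImageEncoder.coswin import SwinTransformer  # assumed import path

# Build the convolutional-embedding Swin variant defined above with illustrative values.
model = SwinTransformer(
    img_size=224, patch_size=7, patch_padding=2, patch_stride=4, in_chans=3,
    num_classes=0,                       # num_classes=0 -> head is nn.Identity
    embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7,
)
features = model(torch.randn(1, 3, 224, 224))
print(features.shape)                    # expected torch.Size([1, 768]), since 96 * 2**3 = 768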
MedImageInsight/ImageEncoder/davit_v1.py
ADDED
@@ -0,0 +1,727 @@
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
import copy
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
import torch.nn.functional as F
|
8 |
+
import torch.utils.checkpoint as checkpoint
|
9 |
+
from collections import OrderedDict
|
10 |
+
|
11 |
+
from einops import rearrange
|
12 |
+
from timm.models.layers import DropPath, trunc_normal_
|
13 |
+
|
14 |
+
# helper methods
|
15 |
+
from .registry import register_image_encoder
|
16 |
+
|
17 |
+
import mup.init
|
18 |
+
from mup import MuReadout, set_base_shapes
|
19 |
+
|
20 |
+
logger = logging.getLogger(__name__)
|
21 |
+
|
22 |
+
|
23 |
+
class MySequential(nn.Sequential):
|
24 |
+
def forward(self, *inputs):
|
25 |
+
for module in self._modules.values():
|
26 |
+
if type(inputs) == tuple:
|
27 |
+
inputs = module(*inputs)
|
28 |
+
else:
|
29 |
+
inputs = module(inputs)
|
30 |
+
return inputs
|
31 |
+
|
32 |
+
|
33 |
+
class PreNorm(nn.Module):
|
34 |
+
def __init__(self, norm, fn, drop_path=None):
|
35 |
+
super().__init__()
|
36 |
+
self.norm = norm
|
37 |
+
self.fn = fn
|
38 |
+
self.drop_path = drop_path
|
39 |
+
|
40 |
+
def forward(self, x, *args, **kwargs):
|
41 |
+
shortcut = x
|
42 |
+
if self.norm != None:
|
43 |
+
x, size = self.fn(self.norm(x), *args, **kwargs)
|
44 |
+
else:
|
45 |
+
x, size = self.fn(x, *args, **kwargs)
|
46 |
+
|
47 |
+
if self.drop_path:
|
48 |
+
x = self.drop_path(x)
|
49 |
+
|
50 |
+
x = shortcut + x
|
51 |
+
|
52 |
+
return x, size
|
53 |
+
|
54 |
+
|
55 |
+
class Mlp(nn.Module):
|
56 |
+
""" MLP as used in Vision Transformer, MLP-Mixer and related networks
|
57 |
+
"""
|
58 |
+
|
59 |
+
def __init__(
|
60 |
+
self,
|
61 |
+
in_features,
|
62 |
+
hidden_features=None,
|
63 |
+
out_features=None,
|
64 |
+
act_layer=nn.GELU,
|
65 |
+
):
|
66 |
+
super().__init__()
|
67 |
+
out_features = out_features or in_features
|
68 |
+
hidden_features = hidden_features or in_features
|
69 |
+
self.net = nn.Sequential(OrderedDict([
|
70 |
+
("fc1", nn.Linear(in_features, hidden_features)),
|
71 |
+
("act", act_layer()),
|
72 |
+
("fc2", nn.Linear(hidden_features, out_features))
|
73 |
+
]))
|
74 |
+
|
75 |
+
def forward(self, x, size):
|
76 |
+
return self.net(x), size
|
77 |
+
|
78 |
+
|
79 |
+
class DepthWiseConv2d(nn.Module):
|
80 |
+
def __init__(
|
81 |
+
self,
|
82 |
+
dim_in,
|
83 |
+
kernel_size,
|
84 |
+
padding,
|
85 |
+
stride,
|
86 |
+
bias=True,
|
87 |
+
):
|
88 |
+
super().__init__()
|
89 |
+
self.dw = nn.Conv2d(
|
90 |
+
dim_in, dim_in,
|
91 |
+
kernel_size=kernel_size,
|
92 |
+
padding=padding,
|
93 |
+
groups=dim_in,
|
94 |
+
stride=stride,
|
95 |
+
bias=bias
|
96 |
+
)
|
97 |
+
|
98 |
+
def forward(self, x, size):
|
99 |
+
B, N, C = x.shape
|
100 |
+
H, W = size
|
101 |
+
assert N == H * W
|
102 |
+
|
103 |
+
x = self.dw(x.transpose(1, 2).view(B, C, H, W))
|
104 |
+
size = (x.size(-2), x.size(-1))
|
105 |
+
x = x.flatten(2).transpose(1, 2)
|
106 |
+
return x, size
|
107 |
+
|
108 |
+
|
109 |
+
class ConvEmbed(nn.Module):
|
110 |
+
""" Image to Patch Embedding
|
111 |
+
"""
|
112 |
+
|
113 |
+
def __init__(
|
114 |
+
self,
|
115 |
+
patch_size=7,
|
116 |
+
in_chans=3,
|
117 |
+
embed_dim=64,
|
118 |
+
stride=4,
|
119 |
+
padding=2,
|
120 |
+
norm_layer=None,
|
121 |
+
pre_norm=True
|
122 |
+
):
|
123 |
+
super().__init__()
|
124 |
+
self.patch_size = patch_size
|
125 |
+
|
126 |
+
self.proj = nn.Conv2d(
|
127 |
+
in_chans, embed_dim,
|
128 |
+
kernel_size=patch_size,
|
129 |
+
stride=stride,
|
130 |
+
padding=padding
|
131 |
+
)
|
132 |
+
|
133 |
+
dim_norm = in_chans if pre_norm else embed_dim
|
134 |
+
self.norm = norm_layer(dim_norm) if norm_layer else None
|
135 |
+
|
136 |
+
self.pre_norm = pre_norm
|
137 |
+
|
138 |
+
def forward(self, x, size):
|
139 |
+
H, W = size
|
140 |
+
if len(x.size()) == 3:
|
141 |
+
if self.norm and self.pre_norm:
|
142 |
+
x = self.norm(x)
|
143 |
+
x = rearrange(
|
144 |
+
x, 'b (h w) c -> b c h w',
|
145 |
+
h=H, w=W
|
146 |
+
)
|
147 |
+
|
148 |
+
x = self.proj(x)
|
149 |
+
|
150 |
+
_, _, H, W = x.shape
|
151 |
+
x = rearrange(x, 'b c h w -> b (h w) c')
|
152 |
+
if self.norm and not self.pre_norm:
|
153 |
+
x = self.norm(x)
|
154 |
+
|
155 |
+
return x, (H, W)
|
156 |
+
|
157 |
+
|
158 |
+
class ChannelAttention(nn.Module):
|
159 |
+
|
160 |
+
def __init__(self, dim, base_dim, groups=8, base_groups=8, qkv_bias=True, dynamic_scale=True, standparam=True):
|
161 |
+
super().__init__()
|
162 |
+
|
163 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
164 |
+
self.proj = nn.Linear(dim, dim)
|
165 |
+
self.dynamic_scale = dynamic_scale
|
166 |
+
|
167 |
+
self.dim = dim
|
168 |
+
self.groups = groups
|
169 |
+
self.group_dim = dim // groups
|
170 |
+
|
171 |
+
self.base_dim = base_dim
|
172 |
+
self.base_groups = base_groups
|
173 |
+
self.base_group_dim = base_dim // base_groups
|
174 |
+
|
175 |
+
self.group_wm = self.group_dim / self.base_group_dim # Width multiplier for each group.
|
176 |
+
self.standparam = standparam
|
177 |
+
|
178 |
+
def forward(self, x, size):
|
179 |
+
B, N, C = x.shape
|
180 |
+
assert C == self.dim
|
181 |
+
|
182 |
+
qkv = self.qkv(x).reshape(B, N, 3, self.groups, C // self.groups).permute(2, 0, 3, 1, 4)
|
183 |
+
q, k, v = qkv[0], qkv[1], qkv[2] # Shape: [B, groups, N, group_dim].
|
184 |
+
|
185 |
+
scale = N ** -0.5 if self.dynamic_scale else self.dim ** -0.5
|
186 |
+
|
187 |
+
# Change the scaling factor.
|
188 |
+
# Ref: examples/Transformer/model.py in muP.
|
189 |
+
# Note: We consider backward compatibility and follow https://github.com/microsoft/mup/issues/18.
|
190 |
+
if self.standparam:
|
191 |
+
scale = N ** -0.5 if self.dynamic_scale else self.dim ** -0.5
|
192 |
+
else:
|
193 |
+
assert self.dynamic_scale # Currently only support dynamic scale.
|
194 |
+
scale = N ** -0.5
|
195 |
+
|
196 |
+
q = q * scale
|
197 |
+
attention = q.transpose(-1, -2) @ k
|
198 |
+
attention = attention.softmax(dim=-1)
|
199 |
+
|
200 |
+
if not self.standparam:
|
201 |
+
# Follow https://github.com/microsoft/mup/issues/18.
|
202 |
+
attention = attention / self.group_wm
|
203 |
+
|
204 |
+
x = (attention @ v.transpose(-1, -2)).transpose(-1, -2)
|
205 |
+
x = x.transpose(1, 2).reshape(B, N, C)
|
206 |
+
x = self.proj(x)
|
207 |
+
return x, size
|
208 |
+
|
209 |
+
|
210 |
+
class ChannelBlock(nn.Module):
|
211 |
+
|
212 |
+
def __init__(self, dim, base_dim, groups, base_groups, mlp_ratio=4., qkv_bias=True,
|
213 |
+
drop_path_rate=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
|
214 |
+
conv_at_attn=True, conv_at_ffn=True, dynamic_scale=True, standparam=True):
|
215 |
+
super().__init__()
|
216 |
+
|
217 |
+
drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
|
218 |
+
|
219 |
+
self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
|
220 |
+
self.channel_attn = PreNorm(
|
221 |
+
norm_layer(dim),
|
222 |
+
ChannelAttention(dim, base_dim, groups=groups, base_groups=base_groups, qkv_bias=qkv_bias,
|
223 |
+
dynamic_scale=dynamic_scale, standparam=standparam),
|
224 |
+
drop_path
|
225 |
+
)
|
226 |
+
self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
|
227 |
+
self.ffn = PreNorm(
|
228 |
+
norm_layer(dim),
|
229 |
+
Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer),
|
230 |
+
drop_path
|
231 |
+
)
|
232 |
+
|
233 |
+
def forward(self, x, size):
|
234 |
+
if self.conv1:
|
235 |
+
x, size = self.conv1(x, size)
|
236 |
+
x, size = self.channel_attn(x, size)
|
237 |
+
|
238 |
+
if self.conv2:
|
239 |
+
x, size = self.conv2(x, size)
|
240 |
+
x, size = self.ffn(x, size)
|
241 |
+
|
242 |
+
return x, size
|
243 |
+
|
244 |
+
|
245 |
+
def window_partition(x, window_size: int):
|
246 |
+
B, H, W, C = x.shape
|
247 |
+
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
|
248 |
+
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
|
249 |
+
return windows
|
250 |
+
|
251 |
+
|
252 |
+
def window_reverse(windows, window_size: int, H: int, W: int):
|
253 |
+
B = windows.shape[0] // (H * W // window_size // window_size)
|
254 |
+
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
|
255 |
+
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
|
256 |
+
return x
|
257 |
+
|
258 |
+
|
259 |
+
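# --- Illustrative note (not part of davit_v1.py): window_partition and window_reverse
# above are exact inverses whenever H and W are multiples of window_size. A hedged sketch:
#   x = torch.randn(2, 14, 14, 96)               # (B, H, W, C)
#   windows = window_partition(x, 7)             # -> (2 * 2 * 2, 7, 7, 96)
#   x_back = window_reverse(windows, 7, 14, 14)  # -> (2, 14, 14, 96)
#   assert torch.equal(x, x_back)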
class WindowAttention(nn.Module):
|
260 |
+
|
261 |
+
def __init__(self, dim, base_dim, num_heads, base_num_heads, window_size, qkv_bias=True, standparam=True):
|
262 |
+
|
263 |
+
super().__init__()
|
264 |
+
|
265 |
+
self.window_size = window_size
|
266 |
+
|
267 |
+
self.dim = dim
|
268 |
+
self.num_heads = num_heads
|
269 |
+
head_dim = dim // num_heads
|
270 |
+
|
271 |
+
self.base_dim = base_dim
|
272 |
+
self.base_num_heads = base_num_heads
|
273 |
+
base_head_dim = base_dim // base_num_heads
|
274 |
+
|
275 |
+
# Change the scaling factor.
|
276 |
+
# Ref: examples/Transformer/model.py in muP.
|
277 |
+
# Note: We consider backward compatibility and follow https://github.com/microsoft/mup/issues/17.
|
278 |
+
if standparam:
|
279 |
+
scale = float(head_dim) ** -0.5
|
280 |
+
else:
|
281 |
+
# TODO: Here we ensure backward compatibility, which may not be optimal.
|
282 |
+
# We may add an argument called backward_comp. If it is set as False, we use
|
283 |
+
# float(head_dim) ** -1 * math.sqrt(attn_mult)
|
284 |
+
# as in the Transformer example in muP.
|
285 |
+
base_scale = float(base_head_dim) ** -0.5 # The same as scaling in standard parametrization.
|
286 |
+
head_wm = head_dim / base_head_dim # Width multiplier for each head.
|
287 |
+
scale = base_scale / head_wm
|
288 |
+
# scale_1 = (float(base_head_dim) ** 0.5) * (float(head_dim) ** -1) # Equivalent implementation as shown in the muP paper.
|
289 |
+
# assert np.isclose(scale, scale_1)
|
290 |
+
self.scale = scale
|
291 |
+
|
292 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
293 |
+
self.proj = nn.Linear(dim, dim)
|
294 |
+
|
295 |
+
self.softmax = nn.Softmax(dim=-1)
|
296 |
+
|
297 |
+
def forward(self, x, size):
|
298 |
+
|
299 |
+
H, W = size
|
300 |
+
B, L, C = x.shape
|
301 |
+
assert L == H * W, "input feature has wrong size"
|
302 |
+
|
303 |
+
x = x.view(B, H, W, C)
|
304 |
+
|
305 |
+
pad_l = pad_t = 0
|
306 |
+
pad_r = (self.window_size - W % self.window_size) % self.window_size
|
307 |
+
pad_b = (self.window_size - H % self.window_size) % self.window_size
|
308 |
+
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
|
309 |
+
_, Hp, Wp, _ = x.shape
|
310 |
+
|
311 |
+
x = window_partition(x, self.window_size)
|
312 |
+
x = x.view(-1, self.window_size * self.window_size, C)
|
313 |
+
|
314 |
+
B_, N, C = x.shape
|
315 |
+
qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
316 |
+
q, k, v = qkv[0], qkv[1], qkv[2]
|
317 |
+
|
318 |
+
q = q * self.scale
|
319 |
+
attn = (q @ k.transpose(-2, -1))
|
320 |
+
attn = self.softmax(attn)
|
321 |
+
|
322 |
+
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
|
323 |
+
x = self.proj(x)
|
324 |
+
|
325 |
+
# merge windows
|
326 |
+
x = x.view(
|
327 |
+
-1, self.window_size, self.window_size, C
|
328 |
+
)
|
329 |
+
x = window_reverse(x, self.window_size, Hp, Wp)
|
330 |
+
|
331 |
+
if pad_r > 0 or pad_b > 0:
|
332 |
+
x = x[:, :H, :W, :].contiguous()
|
333 |
+
|
334 |
+
x = x.view(B, H * W, C)
|
335 |
+
|
336 |
+
return x, size
|
337 |
+
|
338 |
+
|
339 |
+
class SpatialBlock(nn.Module):
|
340 |
+
|
341 |
+
def __init__(self, dim, base_dim, num_heads, base_num_heads, window_size,
|
342 |
+
mlp_ratio=4., qkv_bias=True, drop_path_rate=0., act_layer=nn.GELU,
|
343 |
+
norm_layer=nn.LayerNorm, conv_at_attn=True, conv_at_ffn=True, standparam=True):
|
344 |
+
super().__init__()
|
345 |
+
|
346 |
+
drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
|
347 |
+
|
348 |
+
self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
|
349 |
+
self.window_attn = PreNorm(
|
350 |
+
norm_layer(dim),
|
351 |
+
WindowAttention(dim, base_dim, num_heads, base_num_heads, window_size, qkv_bias=qkv_bias,
|
352 |
+
standparam=standparam),
|
353 |
+
drop_path
|
354 |
+
)
|
355 |
+
self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
|
356 |
+
self.ffn = PreNorm(
|
357 |
+
norm_layer(dim),
|
358 |
+
Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer),
|
359 |
+
drop_path
|
360 |
+
)
|
361 |
+
|
362 |
+
def forward(self, x, size):
|
363 |
+
if self.conv1:
|
364 |
+
x, size = self.conv1(x, size)
|
365 |
+
x, size = self.window_attn(x, size)
|
366 |
+
|
367 |
+
if self.conv2:
|
368 |
+
x, size = self.conv2(x, size)
|
369 |
+
x, size = self.ffn(x, size)
|
370 |
+
return x, size
|
371 |
+
|
372 |
+
|
373 |
+
class DaViT(nn.Module):
|
374 |
+
""" DaViT: Dual-Attention Transformer
|
375 |
+
|
376 |
+
Args:
|
377 |
+
img_size (int | tuple(int)): Input image size. Default: 224
|
378 |
+
patch_size (int | tuple(int)): Patch size. Default: 4
|
379 |
+
in_chans (int): Number of input image channels. Default: 3
|
380 |
+
num_classes (int): Number of classes for classification head. Default: 1000
|
381 |
+
depths (tuple(int)): Number of spatial and channel blocks in different stages. Default: (1, 1, 3, 1)
|
382 |
+
patch_size (tuple(int)): Patch sizes in different stages. Default: (7, 2, 2, 2)
|
383 |
+
patch_stride (tuple(int)): Patch strides in different stages. Default: (4, 2, 2, 2)
|
384 |
+
patch_padding (tuple(int)): Patch padding sizes in different stages. Default: (3, 0, 0, 0)
|
385 |
+
patch_prenorm (tuple(bool)): Use pre-normalization or not in different stages. Default: (False, False, False, False)
|
386 |
+
embed_dims (tuple(int)): Patch embedding dimension. Default: (64, 128, 192, 256)
|
387 |
+
base_embed_dims (tuple(int)): Patch embedding dimension (base case for muP). Default: (64, 128, 192, 256)
|
388 |
+
num_heads (tuple(int)): Number of attention heads in different layers. Default: (4, 8, 12, 16)
|
389 |
+
base_num_heads (tuple(int)): Number of attention heads in different layers (base case for muP). Default: (4, 8, 12, 16)
|
390 |
+
num_groups (tuple(int)): Number of groups in channel attention in different layers. Default: (3, 6, 12, 24)
|
391 |
+
base_num_groups (tuple(int)): Number of groups in channel attention in different layers (base case for muP). Default: (3, 6, 12, 24)
|
392 |
+
window_size (int): Window size. Default: 7
|
393 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
|
394 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
|
395 |
+
drop_path_rate (float): Stochastic depth rate. Default: 0.1
|
396 |
+
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
|
397 |
+
enable_checkpoint (bool): If True, enable gradient checkpointing to save memory. Default: False
|
398 |
+
conv_at_attn (bool): If True, add convolution layer before attention. Default: True
|
399 |
+
conv_at_ffn (bool): If True, add convolution layer before ffn. Default: True
|
400 |
+
dynamic_scale (bool): If True, the scale of channel attention depends on the number of tokens. Default: True
|
401 |
+
standparam (bool): Use standard parametrization or mu-parametrization. Default: True (i.e., use standard parametrization)
|
402 |
+
"""
|
403 |
+
|
404 |
+
def __init__(
|
405 |
+
self,
|
406 |
+
img_size=224,
|
407 |
+
in_chans=3,
|
408 |
+
num_classes=1000,
|
409 |
+
depths=(1, 1, 3, 1),
|
410 |
+
patch_size=(7, 2, 2, 2),
|
411 |
+
patch_stride=(4, 2, 2, 2),
|
412 |
+
patch_padding=(3, 0, 0, 0),
|
413 |
+
patch_prenorm=(False, False, False, False),
|
414 |
+
embed_dims=(64, 128, 192, 256),
|
415 |
+
base_embed_dims=(64, 128, 192, 256),
|
416 |
+
num_heads=(3, 6, 12, 24),
|
417 |
+
base_num_heads=(3, 6, 12, 24),
|
418 |
+
num_groups=(3, 6, 12, 24),
|
419 |
+
base_num_groups=(3, 6, 12, 24),
|
420 |
+
window_size=7,
|
421 |
+
mlp_ratio=4.,
|
422 |
+
qkv_bias=True,
|
423 |
+
drop_path_rate=0.1,
|
424 |
+
norm_layer=nn.LayerNorm,
|
425 |
+
enable_checkpoint=False,
|
426 |
+
conv_at_attn=True,
|
427 |
+
conv_at_ffn=True,
|
428 |
+
dynamic_scale=True,
|
429 |
+
standparam=True
|
430 |
+
):
|
431 |
+
super().__init__()
|
432 |
+
|
433 |
+
self.num_classes = num_classes
|
434 |
+
self.embed_dims = embed_dims
|
435 |
+
self.num_heads = num_heads
|
436 |
+
self.num_groups = num_groups
|
437 |
+
self.num_stages = len(self.embed_dims)
|
438 |
+
self.enable_checkpoint = enable_checkpoint
|
439 |
+
assert self.num_stages == len(self.num_heads) == len(self.num_groups)
|
440 |
+
|
441 |
+
num_stages = len(embed_dims)
|
442 |
+
self.img_size = img_size
|
443 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths) * 2)]
|
444 |
+
|
445 |
+
depth_offset = 0
|
446 |
+
convs = []
|
447 |
+
blocks = []
|
448 |
+
for i in range(num_stages):
|
449 |
+
conv_embed = ConvEmbed(
|
450 |
+
patch_size=patch_size[i],
|
451 |
+
stride=patch_stride[i],
|
452 |
+
padding=patch_padding[i],
|
453 |
+
in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
|
454 |
+
embed_dim=self.embed_dims[i],
|
455 |
+
norm_layer=norm_layer,
|
456 |
+
pre_norm=patch_prenorm[i]
|
457 |
+
)
|
458 |
+
convs.append(conv_embed)
|
459 |
+
|
460 |
+
logger.info(f'=> Depth offset in stage {i}: {depth_offset}')
|
461 |
+
block = MySequential(
|
462 |
+
*[
|
463 |
+
MySequential(OrderedDict([
|
464 |
+
(
|
465 |
+
'spatial_block', SpatialBlock(
|
466 |
+
embed_dims[i],
|
467 |
+
base_embed_dims[i],
|
468 |
+
num_heads[i],
|
469 |
+
base_num_heads[i],
|
470 |
+
window_size,
|
471 |
+
drop_path_rate=dpr[depth_offset + j * 2],
|
472 |
+
qkv_bias=qkv_bias,
|
473 |
+
mlp_ratio=mlp_ratio,
|
474 |
+
conv_at_attn=conv_at_attn,
|
475 |
+
conv_at_ffn=conv_at_ffn,
|
476 |
+
standparam=standparam
|
477 |
+
)
|
478 |
+
),
|
479 |
+
(
|
480 |
+
'channel_block', ChannelBlock(
|
481 |
+
embed_dims[i],
|
482 |
+
base_embed_dims[i],
|
483 |
+
num_groups[i],
|
484 |
+
base_num_groups[i],
|
485 |
+
drop_path_rate=dpr[depth_offset + j * 2 + 1],
|
486 |
+
qkv_bias=qkv_bias,
|
487 |
+
mlp_ratio=mlp_ratio,
|
488 |
+
conv_at_attn=conv_at_attn,
|
489 |
+
conv_at_ffn=conv_at_ffn,
|
490 |
+
dynamic_scale=dynamic_scale,
|
491 |
+
standparam=standparam
|
492 |
+
)
|
493 |
+
)
|
494 |
+
])) for j in range(depths[i])
|
495 |
+
]
|
496 |
+
)
|
497 |
+
blocks.append(block)
|
498 |
+
depth_offset += depths[i] * 2
|
499 |
+
|
500 |
+
self.convs = nn.ModuleList(convs)
|
501 |
+
self.blocks = nn.ModuleList(blocks)
|
502 |
+
|
503 |
+
self.norms = norm_layer(self.embed_dims[-1])
|
504 |
+
self.avgpool = nn.AdaptiveAvgPool1d(1)
|
505 |
+
|
506 |
+
if standparam:
|
507 |
+
self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
|
508 |
+
else:
|
509 |
+
self.head = MuReadout(self.embed_dims[-1], num_classes,
|
510 |
+
readout_zero_init=True) # Follow examples/ResNet/resnet.py in muP.
|
511 |
+
|
512 |
+
if torch.cuda.is_available():
|
513 |
+
self.device = torch.device(type="cuda", index=0)
|
514 |
+
else:
|
515 |
+
self.device = torch.device(type="cpu")
|
516 |
+
|
517 |
+
def custom_init_weights(self, use_original_init=True):
|
518 |
+
self.use_original_init = use_original_init
|
519 |
+
logger.info('Custom init: {}'.format('original init' if self.use_original_init else 'muP init'))
|
520 |
+
self.apply(self._custom_init_weights)
|
521 |
+
|
522 |
+
@property
|
523 |
+
def dim_out(self):
|
524 |
+
return self.embed_dims[-1]
|
525 |
+
|
526 |
+
def _custom_init_weights(self, m):
|
527 |
+
# Customized initialization for weights.
|
528 |
+
if self.use_original_init:
|
529 |
+
# Original initialization.
|
530 |
+
# Note: This is not SP init. We do not implement SP init here.
|
531 |
+
custom_trunc_normal_ = trunc_normal_
|
532 |
+
custom_normal_ = nn.init.normal_
|
533 |
+
else:
|
534 |
+
# muP.
|
535 |
+
custom_trunc_normal_ = mup.init.trunc_normal_
|
536 |
+
custom_normal_ = mup.init.normal_
|
537 |
+
|
538 |
+
# These initializations will overwrite the existing initializations from the modules and will be adjusted by set_base_shapes().
|
539 |
+
if isinstance(m, MuReadout):
|
540 |
+
pass # Note: MuReadout is already zero initialized due to readout_zero_init=True.
|
541 |
+
elif isinstance(m, nn.Linear):
|
542 |
+
custom_trunc_normal_(m.weight, std=0.02)
|
543 |
+
if m.bias is not None:
|
544 |
+
nn.init.constant_(m.bias, 0)
|
545 |
+
elif isinstance(m, nn.Conv2d):
|
546 |
+
custom_normal_(m.weight, std=0.02)
|
547 |
+
for name, _ in m.named_parameters():
|
548 |
+
if name in ['bias']:
|
549 |
+
nn.init.constant_(m.bias, 0)
|
550 |
+
elif isinstance(m, nn.LayerNorm): # Follow P24 Layernorm Weights and Biases.
|
551 |
+
nn.init.constant_(m.weight, 1.0)
|
552 |
+
nn.init.constant_(m.bias, 0)
|
553 |
+
elif isinstance(m, nn.BatchNorm2d): # Follow P24 Layernorm Weights and Biases.
|
554 |
+
nn.init.constant_(m.weight, 1.0)
|
555 |
+
nn.init.constant_(m.bias, 0)
|
556 |
+
|
557 |
+
def _try_remap_keys(self, pretrained_dict):
|
558 |
+
remap_keys = {
|
559 |
+
"conv_embeds": "convs",
|
560 |
+
"main_blocks": "blocks",
|
561 |
+
"0.cpe.0.proj": "spatial_block.conv1.fn.dw",
|
562 |
+
"0.attn": "spatial_block.window_attn.fn",
|
563 |
+
"0.cpe.1.proj": "spatial_block.conv2.fn.dw",
|
564 |
+
"0.mlp": "spatial_block.ffn.fn.net",
|
565 |
+
"1.cpe.0.proj": "channel_block.conv1.fn.dw",
|
566 |
+
"1.attn": "channel_block.channel_attn.fn",
|
567 |
+
"1.cpe.1.proj": "channel_block.conv2.fn.dw",
|
568 |
+
"1.mlp": "channel_block.ffn.fn.net",
|
569 |
+
"0.norm1": "spatial_block.window_attn.norm",
|
570 |
+
"0.norm2": "spatial_block.ffn.norm",
|
571 |
+
"1.norm1": "channel_block.channel_attn.norm",
|
572 |
+
"1.norm2": "channel_block.ffn.norm"
|
573 |
+
}
|
574 |
+
|
575 |
+
full_key_mappings = {}
|
576 |
+
for k in pretrained_dict.keys():
|
577 |
+
old_k = k
|
578 |
+
for remap_key in remap_keys.keys():
|
579 |
+
if remap_key in k:
|
580 |
+
logger.info(f'=> Replace {remap_key} with {remap_keys[remap_key]}')
|
581 |
+
k = k.replace(remap_key, remap_keys[remap_key])
|
582 |
+
|
583 |
+
full_key_mappings[old_k] = k
|
584 |
+
|
585 |
+
return full_key_mappings
|
586 |
+
|
587 |
+
def from_state_dict(self, pretrained_dict, pretrained_layers=[], verbose=True):
|
588 |
+
model_dict = self.state_dict()
|
589 |
+
stripped_key = lambda x: x[14:] if x.startswith('image_encoder.') else x
|
590 |
+
full_key_mappings = self._try_remap_keys(pretrained_dict)
|
591 |
+
|
592 |
+
pretrained_dict = {
|
593 |
+
stripped_key(full_key_mappings[k]): v.to(self.device) for k, v in pretrained_dict.items()
|
594 |
+
if stripped_key(full_key_mappings[k]) in model_dict.keys()
|
595 |
+
}
|
596 |
+
need_init_state_dict = {}
|
597 |
+
for k, v in pretrained_dict.items():
|
598 |
+
need_init = (
|
599 |
+
k.split('.')[0] in pretrained_layers
|
600 |
+
or pretrained_layers[0] == '*'
|
601 |
+
)
|
602 |
+
if need_init:
|
603 |
+
if verbose:
|
604 |
+
logger.info(f'=> init {k} from pretrained state dict')
|
605 |
+
|
606 |
+
need_init_state_dict[k] = v.to(self.device)
|
607 |
+
self.load_state_dict(need_init_state_dict, strict=False)
|
608 |
+
|
609 |
+
def from_pretrained(self, pretrained='', pretrained_layers=[], verbose=True):
|
610 |
+
if os.path.isfile(pretrained):
|
611 |
+
logger.info(f'=> loading pretrained model {pretrained}')
|
612 |
+
pretrained_dict = torch.load(pretrained, map_location='cpu')
|
613 |
+
|
614 |
+
self.from_state_dict(pretrained_dict, pretrained_layers, verbose)
|
615 |
+
|
616 |
+
def forward_features(self, x):
|
617 |
+
input_size = (x.size(2), x.size(3))
|
618 |
+
for conv, block in zip(self.convs, self.blocks):
|
619 |
+
x, input_size = conv(x, input_size)
|
620 |
+
if self.enable_checkpoint:
|
621 |
+
x, input_size = checkpoint.checkpoint(block, x, input_size)
|
622 |
+
else:
|
623 |
+
x, input_size = block(x, input_size)
|
624 |
+
|
625 |
+
x = self.avgpool(x.transpose(1, 2))
|
626 |
+
x = torch.flatten(x, 1)
|
627 |
+
x = self.norms(x)
|
628 |
+
|
629 |
+
return x
|
630 |
+
|
631 |
+
def forward(self, x):
|
632 |
+
x = self.forward_features(x)
|
633 |
+
x = self.head(x)
|
634 |
+
return x
|
635 |
+
|
636 |
+
|
637 |
+
def create_encoder(config_encoder):
|
638 |
+
spec = config_encoder['SPEC']
|
639 |
+
standparam = spec.get('STANDPARAM', True)
|
640 |
+
|
641 |
+
if standparam:
|
642 |
+
# Dummy values for muP parameters.
|
643 |
+
base_embed_dims = spec['DIM_EMBED']
|
644 |
+
base_num_heads = spec['NUM_HEADS']
|
645 |
+
base_num_groups = spec['NUM_GROUPS']
|
646 |
+
else:
|
647 |
+
base_embed_dims = spec['BASE_DIM_EMBED']
|
648 |
+
base_num_heads = spec['BASE_NUM_HEADS']
|
649 |
+
base_num_groups = spec['BASE_NUM_GROUPS']
|
650 |
+
|
651 |
+
davit = DaViT(
|
652 |
+
num_classes=config_encoder['NUM_CLASSES'],
|
653 |
+
depths=spec['DEPTHS'],
|
654 |
+
embed_dims=spec['DIM_EMBED'],
|
655 |
+
base_embed_dims=base_embed_dims,
|
656 |
+
num_heads=spec['NUM_HEADS'],
|
657 |
+
base_num_heads=base_num_heads,
|
658 |
+
num_groups=spec['NUM_GROUPS'],
|
659 |
+
base_num_groups=base_num_groups,
|
660 |
+
patch_size=spec['PATCH_SIZE'],
|
661 |
+
patch_stride=spec['PATCH_STRIDE'],
|
662 |
+
patch_padding=spec['PATCH_PADDING'],
|
663 |
+
patch_prenorm=spec['PATCH_PRENORM'],
|
664 |
+
drop_path_rate=spec['DROP_PATH_RATE'],
|
665 |
+
img_size=config_encoder['IMAGE_SIZE'],
|
666 |
+
window_size=spec.get('WINDOW_SIZE', 7),
|
667 |
+
enable_checkpoint=spec.get('ENABLE_CHECKPOINT', False),
|
668 |
+
conv_at_attn=spec.get('CONV_AT_ATTN', True),
|
669 |
+
conv_at_ffn=spec.get('CONV_AT_FFN', True),
|
670 |
+
dynamic_scale=spec.get('DYNAMIC_SCALE', True),
|
671 |
+
standparam=standparam,
|
672 |
+
)
|
673 |
+
return davit
|
674 |
+
|
675 |
+
|
676 |
+
def create_mup_encoder(config_encoder):
|
677 |
+
def gen_config(config, wm):
|
678 |
+
new_config = copy.deepcopy(config)
|
679 |
+
for name in ['DIM_EMBED', 'NUM_HEADS', 'NUM_GROUPS']:
|
680 |
+
base_name = 'BASE_' + name
|
681 |
+
new_values = [round(base_value * wm) for base_value in
|
682 |
+
config['SPEC'][base_name]] # New value = base value * width multiplier.
|
683 |
+
logger.info(f'config["SPEC"]["{name}"]: {new_config["SPEC"][name]} -> {new_values}')
|
684 |
+
new_config['SPEC'][name] = new_values
|
685 |
+
return new_config
|
686 |
+
|
687 |
+
logger.info('muP: Create models and set base shapes')
|
688 |
+
logger.info('=> Create model')
|
689 |
+
model = create_encoder(config_encoder)
|
690 |
+
|
691 |
+
logger.info('=> Create base model')
|
692 |
+
base_config = gen_config(config_encoder, wm=1.0)
|
693 |
+
base_model = create_encoder(base_config)
|
694 |
+
|
695 |
+
logger.info('=> Create delta model')
|
696 |
+
delta_config = gen_config(config_encoder, wm=2.0)
|
697 |
+
delta_model = create_encoder(delta_config)
|
698 |
+
|
699 |
+
logger.info('=> Set base shapes in model for training')
|
700 |
+
set_base_shapes(model, base=base_model, delta=delta_model)
|
701 |
+
|
702 |
+
return model
|
703 |
+
|
704 |
+
|
705 |
+
@register_image_encoder
|
706 |
+
def image_encoder(config_encoder, verbose, **kwargs):
|
707 |
+
spec = config_encoder['SPEC']
|
708 |
+
standparam = spec.get('STANDPARAM', True)
|
709 |
+
|
710 |
+
if standparam:
|
711 |
+
logger.info('Create model with standard parameterization')
|
712 |
+
model = create_encoder(config_encoder)
|
713 |
+
model.custom_init_weights(use_original_init=True)
|
714 |
+
else:
|
715 |
+
logger.info('Create model with mu parameterization')
|
716 |
+
model = create_mup_encoder(config_encoder)
|
717 |
+
model.custom_init_weights(use_original_init=False)
|
718 |
+
|
719 |
+
logger.info('Load model from pretrained checkpoint')
|
720 |
+
if config_encoder['LOAD_PRETRAINED']:
|
721 |
+
model.from_pretrained(
|
722 |
+
config_encoder['PRETRAINED'],
|
723 |
+
config_encoder['PRETRAINED_LAYERS'],
|
724 |
+
verbose
|
725 |
+
)
|
726 |
+
|
727 |
+
return model
|
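A hedged sketch of how the registered DaViT factory above might be invoked; every key and value in this config dict is an assumption for illustration, not the project's actual configuration.

config_encoder = {
    'NAME': 'davit_v1',
    'NUM_CLASSES': 0,
    'IMAGE_SIZE': 224,
    'LOAD_PRETRAINED': False,
    'SPEC': {
        'DEPTHS': [1, 1, 3, 1],
        'DIM_EMBED': [96, 192, 384, 768],
        'NUM_HEADS': [3, 6, 12, 24],
        'NUM_GROUPS': [3, 6, 12, 24],
        'PATCH_SIZE': [7, 2, 2, 2],
        'PATCH_STRIDE': [4, 2, 2, 2],
        'PATCH_PADDING': [3, 0, 0, 0],
        'PATCH_PRENORM': [False, False, False, False],
        'DROP_PATH_RATE': 0.1,
        'STANDPARAM': True,   # standard parametrization; False would go through create_mup_encoder
    },
}
model = image_encoder(config_encoder, verbose=True)   # the factory defined at the end of this file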
MedImageInsight/ImageEncoder/registry.py
ADDED
@@ -0,0 +1,18 @@
1 |
+
_image_encoders = {}
|
2 |
+
|
3 |
+
|
4 |
+
def register_image_encoder(fn):
|
5 |
+
module_name_split = fn.__module__.split('.')
|
6 |
+
model_name = module_name_split[-1]
|
7 |
+
|
8 |
+
_image_encoders[model_name] = fn
|
9 |
+
|
10 |
+
return fn
|
11 |
+
|
12 |
+
|
13 |
+
def image_encoders(model_name):
|
14 |
+
return _image_encoders[model_name]
|
15 |
+
|
16 |
+
|
17 |
+
def is_image_encoder(model_name):
|
18 |
+
return model_name in _image_encoders
|
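A hedged sketch of how the image-encoder registry above is typically consumed; the builder shown here is an assumption (the project's actual ImageEncoder/build.py may differ), but it illustrates that the lookup key is the last component of the factory's module name (e.g. 'davit_v1' or 'coswin').

from MedImageInsight.ImageEncoder.registry import image_encoders, is_image_encoder
import MedImageInsight.ImageEncoder.davit_v1     # importing the module runs @register_image_encoder

def build_image_encoder(config_encoder, verbose=True, **kwargs):
    model_name = config_encoder['NAME']
    if not is_image_encoder(model_name):
        raise ValueError(f'Unknown image encoder: {model_name}')
    return image_encoders(model_name)(config_encoder, verbose, **kwargs)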
MedImageInsight/LangEncoder/__init__.py
ADDED
@@ -0,0 +1,13 @@
1 |
+
from __future__ import absolute_import
|
2 |
+
from __future__ import division
|
3 |
+
from __future__ import print_function
|
4 |
+
|
5 |
+
from .build import build_lang_encoder
|
6 |
+
from .build import build_tokenizer
|
7 |
+
|
8 |
+
from .transformer import *
|
9 |
+
# from .hf_model import *
|
10 |
+
# from .zcode import *
|
11 |
+
# from .pretrain import *
|
12 |
+
# from .tulrv6 import *
|
13 |
+
# from .t5 import *
|
MedImageInsight/LangEncoder/build.py
ADDED
@@ -0,0 +1,108 @@
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
|
4 |
+
from transformers import CLIPTokenizer, CLIPTokenizerFast
|
5 |
+
from transformers import AutoTokenizer
|
6 |
+
|
7 |
+
from .registry import lang_encoders
|
8 |
+
from .registry import is_lang_encoder
|
9 |
+
|
10 |
+
logger = logging.getLogger(__name__)
|
11 |
+
|
12 |
+
|
13 |
+
def build_lang_encoder(config_encoder, tokenizer, verbose, **kwargs):
|
14 |
+
model_name = config_encoder['NAME']
|
15 |
+
|
16 |
+
if model_name.endswith('pretrain'):
|
17 |
+
model_name = 'pretrain'
|
18 |
+
|
19 |
+
if not is_lang_encoder(model_name):
|
20 |
+
raise ValueError(f'Unknown model: {model_name}')
|
21 |
+
|
22 |
+
return lang_encoders(model_name)(config_encoder, tokenizer, verbose, **kwargs)
|
23 |
+
|
24 |
+
|
25 |
+
def post_process_clip(text):
|
26 |
+
text['input_ids'].squeeze_() # torch.Size([1, 77])
|
27 |
+
text['attention_mask'].squeeze_() # torch.Size([1, 77])
|
28 |
+
return text
|
29 |
+
|
30 |
+
|
31 |
+
def build_tokenizer(config_encoder):
|
32 |
+
tokenizer = None
|
33 |
+
os.environ['TOKENIZERS_PARALLELISM'] = 'false' # default to 'false' to avoid hanging; the 'clip' branch below switches it to 'true'
|
34 |
+
|
35 |
+
if config_encoder['TOKENIZER'] == 'clip':
|
36 |
+
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
|
37 |
+
pretrained_tokenizer = config_encoder.get(
|
38 |
+
'PRETRAINED_TOKENIZER', 'openai/clip-vit-base-patch32'
|
39 |
+
)
|
40 |
+
# print(pretrained_tokenizer)
|
41 |
+
tokenizer = CLIPTokenizer.from_pretrained(pretrained_tokenizer)
|
42 |
+
tokenizer.add_special_tokens({'cls_token': tokenizer.eos_token})
|
43 |
+
tokenizer.post_process = post_process_clip
|
44 |
+
elif config_encoder['TOKENIZER'] == 'clip-fast':
|
45 |
+
pretrained_tokenizer = config_encoder.get(
|
46 |
+
'PRETRAINED_TOKENIZER', 'openai/clip-vit-base-patch32'
|
47 |
+
)
|
48 |
+
tokenizer = CLIPTokenizerFast.from_pretrained(pretrained_tokenizer, from_slow=True)
|
49 |
+
tokenizer.post_process = post_process_clip
|
50 |
+
elif config_encoder['TOKENIZER'] == 'zcodepp':
|
51 |
+
from .zcodepp import ZCodeppTokenizer
|
52 |
+
tokenizer = ZCodeppTokenizer(config_encoder)
|
53 |
+
tokenizer.post_process = lambda x: x
|
54 |
+
elif config_encoder['TOKENIZER'] == 'zcode':
|
55 |
+
from transformers import XLMRobertaTokenizer
|
56 |
+
tokenizer = XLMRobertaTokenizer.from_pretrained(config_encoder['PRETRAINED_TOKENIZER'])
|
57 |
+
elif config_encoder['TOKENIZER'] == 'tulrv6':
|
58 |
+
from .modeling_tulrv6 import TULRv6Tokenizer
|
59 |
+
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
60 |
+
pretrained_tokenizer = config_encoder.get(
|
61 |
+
'PRETRAINED_TOKENIZER', 'tulrv6-base'
|
62 |
+
)
|
63 |
+
tokenizer = TULRv6Tokenizer.from_pretrained(pretrained_tokenizer)
|
64 |
+
# tokenizer.post_process = post_process_clip
|
65 |
+
else:
|
66 |
+
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
67 |
+
pretrained_tokenizer = config_encoder.get('PRETRAINED_TOKENIZER', '')
|
68 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
69 |
+
pretrained_tokenizer
|
70 |
+
if pretrained_tokenizer else config_encoder['TOKENIZER']
|
71 |
+
)
|
72 |
+
tokenizer.post_process = post_process_clip
|
73 |
+
|
74 |
+
# Extra configurations.
|
75 |
+
if 'TOKENIZER_CONF' in config_encoder:
|
76 |
+
tokenizer_conf = config_encoder['TOKENIZER_CONF']
|
77 |
+
|
78 |
+
num_pretrained_tokens = len(tokenizer)
|
79 |
+
|
80 |
+
addition_special_tokens_config = tokenizer_conf.get('ADDITIONAL_SPECIAL_TOKENS', None)
|
81 |
+
if addition_special_tokens_config == 'od+cap':
|
82 |
+
# Note: We still keep the additional special tokens from original tokenizer when we add new special tokens.
|
83 |
+
# This is to make sure tokenizer.additional_special_tokens afterwards includes original additional special tokens.
|
84 |
+
special_tokens_dict = {
|
85 |
+
'additional_special_tokens': \
|
86 |
+
tokenizer.additional_special_tokens + \
|
87 |
+
['<od>','</od>','<cap>','</cap>'] + \
|
88 |
+
[f'<loc_{x}>' for x in range(tokenizer_conf.get('NUM_LOCATION_TOKENS', 0))]
|
89 |
+
}
|
90 |
+
tokenizer.add_special_tokens(special_tokens_dict)
|
91 |
+
elif isinstance(addition_special_tokens_config, list):
|
92 |
+
special_tokens_dict = {
|
93 |
+
'additional_special_tokens': \
|
94 |
+
tokenizer.additional_special_tokens + \
|
95 |
+
addition_special_tokens_config + \
|
96 |
+
[f'<loc_{x}>' for x in range(tokenizer_conf.get('NUM_LOCATION_TOKENS', 0))]+
|
97 |
+
[f'<time_{x}>' for x in range(
|
98 |
+
tokenizer_conf.get('NUM_TIME_TOKENS', 0))]
|
99 |
+
}
|
100 |
+
tokenizer.add_special_tokens(special_tokens_dict)
|
101 |
+
elif addition_special_tokens_config is not None:
|
102 |
+
raise ValueError('ADDITIONAL_SPECIAL_TOKENS type error')
|
103 |
+
|
104 |
+
num_current_tokens = len(tokenizer)
|
105 |
+
logger.info(f'{num_pretrained_tokens} tokens in pretrained tokenizer => {num_current_tokens} in current tokenizer')
|
106 |
+
logger.info(f'All special tokens in tokenizer: {tokenizer.additional_special_tokens}')
|
107 |
+
|
108 |
+
return tokenizer
|
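A hedged usage sketch for build_tokenizer above, exercising the 'clip' branch; the pretrained checkpoint name and the extra special token are illustrative assumptions.

config_encoder = {
    'TOKENIZER': 'clip',
    'PRETRAINED_TOKENIZER': 'openai/clip-vit-base-patch32',
    'TOKENIZER_CONF': {
        'ADDITIONAL_SPECIAL_TOKENS': ['<sep>'],   # handled by the isinstance(list) branch
        'NUM_LOCATION_TOKENS': 0,
        'NUM_TIME_TOKENS': 0,
    },
}
tokenizer = build_tokenizer(config_encoder)
text = tokenizer('frontal chest x-ray', padding='max_length', max_length=77, return_tensors='pt')
text = tokenizer.post_process(text)               # post_process_clip squeezes ids/mask to shape [77]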
MedImageInsight/LangEncoder/registry.py
ADDED
@@ -0,0 +1,18 @@
1 |
+
_lang_encoders = {}
|
2 |
+
|
3 |
+
|
4 |
+
def register_lang_encoder(fn):
|
5 |
+
module_name_split = fn.__module__.split('.')
|
6 |
+
model_name = module_name_split[-1]
|
7 |
+
|
8 |
+
_lang_encoders[model_name] = fn
|
9 |
+
|
10 |
+
return fn
|
11 |
+
|
12 |
+
|
13 |
+
def lang_encoders(model_name):
|
14 |
+
return _lang_encoders[model_name]
|
15 |
+
|
16 |
+
|
17 |
+
def is_lang_encoder(model_name):
|
18 |
+
return model_name in _lang_encoders
|
MedImageInsight/LangEncoder/transformer.py
ADDED
@@ -0,0 +1,210 @@
+from collections import OrderedDict
+from typing import Tuple, Union
+import logging
+import os
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from timm.models.layers import DropPath, trunc_normal_
+
+from .registry import register_lang_encoder
+from ..Utils import is_main_process
+from ..Utils import register_norm_module
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_norm_module
+class LayerNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-12):
+        """Construct a layernorm module in the TF style (epsilon inside the square root)."""
+        super(LayerNorm, self).__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.bias = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, x):
+        pdtype = x.dtype
+        x = x.float()
+        u = x.mean(-1, keepdim=True)
+        s = (x - u).pow(2).mean(-1, keepdim=True)
+        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
+        return self.weight * x.to(pdtype) + self.bias
+
+
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor):
+        return x * torch.sigmoid(1.702 * x)
+
+
+class ResidualAttentionBlock(nn.Module):
+    def __init__(self,
+                 d_model: int,
+                 n_head: int,
+                 attn_mask: torch.Tensor = None,
+                 drop_path: float = 0.0):
+        super().__init__()
+
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(OrderedDict([
+            ("c_fc", nn.Linear(d_model, d_model * 4)),
+            ("gelu", QuickGELU()),
+            ("c_proj", nn.Linear(d_model * 4, d_model))
+        ]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+
+    def attention(self, x: torch.Tensor, key_padding_mask: torch.Tensor = None):
+        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) \
+            if self.attn_mask is not None else None
+
+        return self.attn(
+            x, x, x,
+            key_padding_mask=key_padding_mask,
+            need_weights=False,
+            attn_mask=self.attn_mask
+        )[0]
+
+    def forward(self, x: torch.Tensor, key_padding_mask: torch.Tensor = None):
+        x = x + self.drop_path(self.attention(self.ln_1(x), key_padding_mask=key_padding_mask))
+        x = x + self.drop_path(self.mlp(self.ln_2(x)))
+        return x
+
+
+class Transformer(nn.Module):
+    def __init__(self,
+                 context_length: int,
+                 vocab_size: int,
+                 width: int,
+                 layers: int,
+                 heads: int,
+                 drop_path: float = 0.0,
+                 autogressive: bool = True,
+                 key_padding_token: int = 0,
+                 ):
+        super().__init__()
+
+        self.token_embedding = nn.Embedding(vocab_size, width)
+        self.key_padding_token = key_padding_token
+
+        self.context_length = context_length
+        self.positional_embedding = nn.Parameter(
+            torch.empty(self.context_length, width)
+        )
+
+        self.width = width
+        self.layers = layers
+        self.autogressive = autogressive
+        attn_mask = self.build_attention_mask() if autogressive else None
+        dpr = [x.item() for x in torch.linspace(0, drop_path, layers)]  # stochastic depth decay rule
+        self.resblocks = nn.ModuleList(
+            [
+                ResidualAttentionBlock(width, heads, attn_mask, dpr[i])
+                for i in range(layers)
+            ]
+        )
+
+        self.ln_final = LayerNorm(width)
+
+        trunc_normal_(self.positional_embedding, std=.02)
+        # nn.init.normal_(self.token_embedding, std=.02)
+        trunc_normal_(self.token_embedding.weight, std=.02)
+        self.apply(self._init_weights)
+
+    @property
+    def dim_out(self):
+        return self.width
+
+    def build_attention_mask(self):
+        # lazily create causal attention mask, with full attention between the vision tokens
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(self.context_length, self.context_length)
+        mask.fill_(float("-inf"))
+        mask.triu_(1)  # zero out the lower diagonal
+        return mask
+
+    def _init_weights(self, m):
+        if isinstance(m, (nn.Linear, nn.Conv2d)):
+            if is_main_process():
+                logger.info('=> init weight of Linear/Conv2d from trunc norm')
+            trunc_normal_(m.weight, std=0.02)
+            if m.bias is not None:
+                if is_main_process():
+                    logger.info('=> init bias of Linear/Conv2d to zeros')
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d)):
+            nn.init.constant_(m.bias, 0)
+
+    def load_pretrained(self, pretrained='', pretrained_layers=[], verbose=True):
+        if os.path.isfile(pretrained):
+            pretrained_dict = torch.load(pretrained, map_location='cpu')
+            logging.info(f'=> loading pretrained model {pretrained}')
+            model_dict = self.state_dict()
+            pretrained_dict = {
+                k: v for k, v in pretrained_dict.items()
+                if k in model_dict.keys()
+            }
+            need_init_state_dict = {}
+            for k, v in pretrained_dict.items():
+                need_init = (
+                    k.split('.')[0] in pretrained_layers
+                    or pretrained_layers[0] == '*'
+                )
+                if need_init:
+                    if verbose:
+                        logging.info(f'=> init {k} from {pretrained}')
+
+                    need_init_state_dict[k] = v
+            self.load_state_dict(need_init_state_dict, strict=False)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {
+            'positional_embedding',
+            'token_embedding',
+        }
+
+    def forward(self, input_ids, attention_mask=None):
+        input_ids = input_ids.to(self.positional_embedding.device, non_blocking=True)
+        # Here we generate key_padding_mask using attention_mask instead of using
+        # a predefined key_padding_token (e.g., 0). This is to solve a discrepancy
+        # between Transformers 4.16.2 and 4.25.1, since Transformers 4.16.2 uses token id 0
+        # for padding but 4.25.1 uses the EOS token (token id 49407) for padding.
+        key_padding_mask = (attention_mask == 0) if not self.autogressive else None
+        # a True value indicates that the corresponding key value will be ignored for the purpose of attention
+        x = self.token_embedding(input_ids)  # [batch_size, n_ctx, d_model]
+        x = x + self.positional_embedding
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        for block in self.resblocks:
+            x = block(x, key_padding_mask)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+
+        x = self.ln_final(x)
+
+        return {'last_hidden_state': x}
+
+
+@register_lang_encoder
+def lang_encoder(config_encoder, tokenizer, verbose, **kwargs):
+    transformer = Transformer(
+        context_length=config_encoder['CONTEXT_LENGTH'],
+        vocab_size=tokenizer.vocab_size,
+        width=config_encoder['WIDTH'],
+        layers=config_encoder['LAYERS'],
+        heads=config_encoder['HEADS'],
+        autogressive=config_encoder.get('AUTOGRESSIVE', True),
+        key_padding_token=config_encoder.get('KEY_PADDING_TOKEN', 0),
+    )
+
+    if config_encoder['LOAD_PRETRAINED']:
+        transformer.load_pretrained()
+
+    return transformer
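A minimal sketch of instantiating this Transformer directly and running a forward pass (illustrative, not one of the uploaded files; the sizes are arbitrary and chosen only so that the shapes line up with `context_length`):

```python
# Hedged sketch: toy instantiation of the text Transformer defined above.
import torch
from MedImageInsight.LangEncoder.transformer import Transformer

model = Transformer(
    context_length=16,    # positional embedding length; inputs must match it
    vocab_size=1000,
    width=64,
    layers=2,
    heads=4,
    autogressive=False,   # non-causal: key_padding_mask is derived from attention_mask
)

input_ids = torch.randint(0, 1000, (2, 16))            # [batch, context_length]
attention_mask = torch.ones(2, 16, dtype=torch.long)   # 1 = real token, 0 = padding

out = model(input_ids, attention_mask=attention_mask)
print(out['last_hidden_state'].shape)  # torch.Size([2, 16, 64])
```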
MedImageInsight/UniCLModel.py
ADDED
@@ -0,0 +1,293 @@
+import pathlib
+import tempfile
+import logging
+import os
+import copy
+
+import torch
+from torch import nn
+
+from timm.models.layers import trunc_normal_
+
+from .ImageEncoder import build_image_encoder
+from .LangEncoder import build_lang_encoder
+from .LangEncoder import build_tokenizer
+
+import mup.init
+from mup import set_base_shapes
+
+from safetensors.torch import load_file
+
+
+logger = logging.getLogger(__name__)
+
+
+class UniCLModel(nn.Module):
+    def __init__(self, config: dict):
+        super().__init__()
+
+        self.conf_lang_encoder = config['LANG_ENCODER']
+        self.tokenizer = build_tokenizer(self.conf_lang_encoder)
+
+        self.lang_encoder = build_lang_encoder(self.conf_lang_encoder, self.tokenizer, config['VERBOSE'])
+
+        dim_projection = config['UNICL_MODEL']['DIM_PROJECTION']
+        if hasattr(self.lang_encoder, 'dim_out'):
+            dim_out = self.lang_encoder.dim_out
+        else:
+            with torch.no_grad():
+                dim_out = self.lang_encoder(
+                    torch.zeros(1, 1).type(torch.LongTensor)
+                )['last_hidden_state'].size(2)
+
+        self.lang_projection = nn.Parameter(torch.empty(dim_out, dim_projection))
+
+        self.conf_image_encoder = config['IMAGE_ENCODER']
+        self.image_encoder = build_image_encoder(self.conf_image_encoder, config['VERBOSE'])
+
+        self.image_projection = nn.Parameter(
+            torch.empty(self.image_encoder.dim_out, dim_projection)
+        )
+
+        self.logit_scale = nn.Parameter(torch.ones([]))
+
+        if torch.cuda.is_available():
+            self.device = torch.device(type="cuda", index=0)
+        else:
+            self.device = torch.device(type="cpu")
+
+    def custom_init_weights(self, use_original_init=True):
+        self.use_original_init = use_original_init
+        logger.info('Custom init: {}'.format('original init' if self.use_original_init else 'muP init'))
+
+        if self.use_original_init:
+            # Original initialization.
+            # Note: This is not SP init. We do not implement SP init here.
+            custom_trunc_normal_ = trunc_normal_  # Note: This should be the same as torch.nn.init.trunc_normal_
+        else:
+            # muP.
+            custom_trunc_normal_ = mup.init.trunc_normal_
+
+        custom_trunc_normal_(self.lang_projection, std=.02)
+        custom_trunc_normal_(self.image_projection, std=.02)
+
+    def _convert_old_weights(self, model_dict):
+        model_dict_updated = {}
+        for k, v in model_dict.items():
+            if k.startswith('visual.'):
+                model_dict_updated['image_encoder.' + k[7:]] = v
+            elif k.startswith('text.'):
+                model_dict_updated['lang_encoder.' + k[5:]] = v
+            elif k == 'vision_projection':
+                model_dict_updated['image_projection'] = v
+            elif k == 'text_projection':
+                model_dict_updated['lang_projection'] = v
+            else:
+                model_dict_updated[k] = v
+
+        return model_dict_updated
+
+    def from_pretrained(self, pretrained='', pretrained_layers=[], verbose=True):
+        if not os.path.isfile(pretrained):
+            logger.warning(f'=> Pretrained model ({pretrained}) is not a file, skip init weight')
+            return
+
+        # Load the SafeTensors version of the pretrained model
+        pretrained_dict = load_file(pretrained)
+        logger.info(f'=> Loading pretrained model {pretrained}')
+        model_dict = self.state_dict()
+        pretrained_dict = self._convert_old_weights(pretrained_dict)
+        # Ensure all weights from the SafeTensors checkpoint are mapped to the model device
+        pretrained_dict = {
+            k: v.to(self.device) for k, v in pretrained_dict.items()
+        }
+        need_init_state_dict = {}
+        image_encoder_state_dict = {}
+        for k, v in pretrained_dict.items():
+            need_init = (
+                k.split('.')[0] in pretrained_layers
+                or pretrained_layers[0] == '*'
+            )
+
+            if need_init:
+                if k.startswith('image_encoder.'):
+                    image_encoder_state_dict[k] = v.to(self.device)
+                else:
+                    if verbose:
+                        logger.info(f'=> init {k} from {pretrained}')
+
+                    if 'positional_embedding' in k and v.size() != model_dict[k].size():
+                        positional_embedding_pretrained = v
+                        positional_embedding_current = model_dict[k]
+                        L1, nH1 = positional_embedding_pretrained.size()
+                        L2, nH2 = positional_embedding_current.size()
+                        if nH1 != nH2:
+                            logger.info(f"Error in loading {k}, passing")
+                        else:
+                            if L1 != L2:
+                                logger.info(
+                                    '=> load_pretrained: resized variant: {} to {}'
+                                    .format((L1, nH1), (L2, nH2))
+                                )
+
+                                posemb = positional_embedding_pretrained.float()
+                                posemb_grid = posemb.unsqueeze(dim=0).permute(0, 2, 1)
+                                posemb_grid = torch.nn.functional.interpolate(posemb_grid, size=L2, mode='linear')
+                                posemb_grid = posemb_grid.permute(0, 2, 1).squeeze(dim=0)
+                                v = posemb_grid
+
+                    need_init_state_dict[k] = v.to(self.device)
+        self.image_encoder.from_state_dict(image_encoder_state_dict, ['*'], verbose)
+        self.load_state_dict(need_init_state_dict, strict=False)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        no_weight_decay = {'logit_scale'}
+        if hasattr(self.lang_encoder, 'no_weight_decay'):
+            for k in self.lang_encoder.no_weight_decay():
+                no_weight_decay.add('lang_encoder.' + k)
+
+        if hasattr(self.image_encoder, 'no_weight_decay'):
+            # note: use the image_encoder attribute here; this class has no `visual` attribute
+            for k in self.image_encoder.no_weight_decay():
+                no_weight_decay.add('image_encoder.' + k)
+
+        return no_weight_decay
+
+    @property
+    def dtype(self):
+        return self.logit_scale.dtype
+
+    def encode_image(self, image, norm=True):
+        x = self.image_encoder.forward_features(image)
+        x = x @ self.image_projection
+
+        if norm:
+            x = x / x.norm(dim=-1, keepdim=True)
+
+        return x
+
+    def encode_text(self, text, norm=True):
+        x = self.lang_encoder(**text)
+        x = x['last_hidden_state']
+
+        if self.conf_lang_encoder['TOKENIZER'] == 'clip':
+            x = x[torch.arange(x.size(0)), text['input_ids'].argmax(dim=-1)]
+        else:
+            x = x[:, 0]
+
+        x = x @ self.lang_projection
+
+        if norm:
+            x = x / x.norm(dim=-1, keepdim=True)
+
+        return x
+
+    def forward(self, image, text):
+        features_image = self.encode_image(image)
+        features_text = self.encode_text(text)
+
+        # cosine similarity as logits
+        T = self.logit_scale.exp()
+
+        return features_image, features_text, T
+
+
+def create_model(config):
+    model = UniCLModel(config)
+    return model
+
+
+def create_mup_model(config):
+    def gen_config(config, wm):
+        # TODO: Currently only support the case that all of UniCL, the lang encoder, and the image encoder use
+        # mu parameterization. This requirement can be relaxed.
+        assert (not config['UNICL_MODEL']['STANDPARAM']) and \
+               (not config['LANG_ENCODER']['STANDPARAM']) and \
+               (not config['IMAGE_ENCODER']['SPEC']['STANDPARAM'])
+        new_config = copy.deepcopy(config)
+        logger.info(f'Generate config with width mult = {wm}:')
+
+        # Generate config for UniCL head.
+        new_config_section = new_config['UNICL_MODEL']
+        new_config_section['STANDPARAM'] = True  # Use standard parameterization when determining base shapes.
+        for name in ['DIM_PROJECTION']:
+            base_name = 'BASE_' + name
+            new_values = round(new_config_section[base_name] * wm)  # New value = base value * width multiplier.
+            logger.info(f'config["UNICL_MODEL"]["{name}"]: {new_config_section[name]} -> {new_values}')
+            new_config_section[name] = new_values
+
+        # Generate config for lang encoder.
+        new_config_section = new_config['LANG_ENCODER']
+        new_config_section['STANDPARAM'] = True
+        for name in ['WIDTH', 'HEADS']:
+            base_name = 'BASE_' + name
+            new_values = round(new_config_section[base_name] * wm)  # New value = base value * width multiplier.
+            logger.info(f'config["LANG_ENCODER"]["{name}"]: {new_config_section[name]} -> {new_values}')
+            new_config_section[name] = new_values
+
+        # Generate config for image encoder.
+        new_config_section = new_config['IMAGE_ENCODER']['SPEC']
+        new_config_section['STANDPARAM'] = True
+        for name in ['DIM_EMBED', 'NUM_HEADS', 'NUM_GROUPS']:
+            base_name = 'BASE_' + name
+            new_values = [round(base_value * wm) for base_value in new_config_section[base_name]]  # New value = base value * width multiplier.
+            logger.info(f'config["IMAGE_ENCODER"]["SPEC"]["{name}"]: {new_config_section[name]} -> {new_values}')
+            new_config_section[name] = new_values
+
+        return new_config
+
+    logger.info('muP: Create models and set base shapes')
+    logger.info('=> Create model')
+    model = create_model(config)
+    # Temporarily remove the lang and image encoders from the model to prevent
+    # setting the base shapes for these encoders again.
+    lang_encoder, image_encoder = model.lang_encoder, model.image_encoder
+    model.lang_encoder, model.image_encoder = None, None
+
+    logger.info('=> Create base model')
+    base_config = gen_config(config, wm=1.0)
+    base_model = create_model(base_config)
+    del base_model.lang_encoder, base_model.image_encoder
+
+    logger.info('=> Create delta model')
+    delta_config = gen_config(config, wm=2.0)
+    delta_model = create_model(delta_config)
+    del delta_model.lang_encoder, delta_model.image_encoder
+
+    logger.info('=> Set base shapes in model for training')
+    set_base_shapes(model, base=base_model, delta=delta_model)
+
+    # Restore the lang and image encoders in the model.
+    model.lang_encoder, model.image_encoder = lang_encoder, image_encoder
+
+    return model
+
+
+def build_unicl_model(config, **kwargs):
+    standparam = config['UNICL_MODEL'].get('STANDPARAM', True)
+
+    if standparam:
+        logger.info('Create model with standard parameterization')
+        model = create_model(config)
+
+        use_original_init = True
+    else:
+        logger.info('Create model with mu parameterization')
+        model = create_mup_model(config)
+        use_original_init = False
+
+    # Initialize other parameters.
+    model.custom_init_weights(use_original_init=use_original_init)
+
+    if config['UNICL_MODEL']['LOAD_PRETRAINED']:
+        pretrained_path = config['UNICL_MODEL']['PRETRAINED']
+        from .Distributed.Utils import is_valid_url, download_file
+        if is_valid_url(pretrained_path):
+            with tempfile.TemporaryDirectory() as tmp_path:
+                file_local_path = pathlib.Path(tmp_path) / 'base_model.pt'
+                download_file(pretrained_path, file_local_path)
+                model.from_pretrained(str(file_local_path), config['UNICL_MODEL']['PRETRAINED_LAYERS'], config['VERBOSE'])
+        else:
+            model.from_pretrained(pretrained_path, config['UNICL_MODEL']['PRETRAINED_LAYERS'], config['VERBOSE'])
+
+    return model
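`UniCLModel.forward` returns the normalized image features, the normalized text features, and the learned temperature `T`; the contrastive logits are formed outside the model. A hedged sketch of that last step (illustrative only; the training loss is not part of this file):

```python
# How the three forward outputs are typically combined into contrastive logits.
import torch
import torch.nn.functional as F

def unicl_style_logits(features_image, features_text, T):
    # features are already L2-normalized by encode_image / encode_text
    logits_per_image = T * features_image @ features_text.t()
    logits_per_text = logits_per_image.t()
    return logits_per_image, logits_per_text

# toy tensors standing in for model outputs
img = F.normalize(torch.randn(4, 512), dim=-1)
txt = F.normalize(torch.randn(4, 512), dim=-1)
logits_i, logits_t = unicl_style_logits(img, txt, T=torch.tensor(100.0))
print(logits_i.shape)  # torch.Size([4, 4])
```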
MedImageInsight/Utils/Arguments.py
ADDED
@@ -0,0 +1,134 @@
+import argparse
+import json
+import logging
+import os
+import re
+import yaml
+
+logger = logging.getLogger(__name__)
+
+
+def add_env_parser_to_yaml():
+    """
+    Add the ability to resolve environment variables to the yaml SafeLoader.
+    Environment variables in the form "${<env_var_name>}" are resolved as strings.
+    If <env_var_name> is not set in the environment, <env_var_name> itself is used.
+
+    E.g.:
+    config:
+      username: admin
+      password: ${SERVICE_PASSWORD}
+      service: https://${SERVICE_HOST}/service
+    """
+    loader = yaml.SafeLoader
+    env_pattern = re.compile(r".*?\${(.*?)}.*?")
+
+    def env_constructor(loader, node):
+        value = loader.construct_scalar(node)
+        for group in env_pattern.findall(value):
+            value = value.replace(f"${{{group}}}", os.environ.get(group, group))
+        return value
+
+    yaml.add_implicit_resolver("!ENV", env_pattern, Loader=loader)
+    yaml.add_constructor("!ENV", env_constructor, Loader=loader)
+
+
+def load_config_dict_to_opt(opt, config_dict, splitter='.', log_new=False):
+    """
+    Load the key, value pairs from config_dict to opt, overriding existing values in opt
+    if there are any.
+    """
+    if not isinstance(config_dict, dict):
+        raise TypeError("Config must be a Python dictionary")
+    for k, v in config_dict.items():
+        k_parts = k.split(splitter)
+        pointer = opt
+        for k_part in k_parts[:-1]:
+            if '[' in k_part and ']' in k_part:
+                # for the format "a.b[0][1].c: d"
+                k_part_splits = k_part.split('[')
+                k_part = k_part_splits.pop(0)
+                pointer = pointer[k_part]
+                for i in k_part_splits:
+                    assert i[-1] == ']'
+                    pointer = pointer[int(i[:-1])]
+            else:
+                if k_part not in pointer:
+                    pointer[k_part] = {}
+                pointer = pointer[k_part]
+            assert isinstance(pointer, dict), "Overriding key needs to be inside a Python dict."
+        if '[' in k_parts[-1] and ']' in k_parts[-1]:
+            k_part_splits = k_parts[-1].split('[')
+            k_part = k_part_splits.pop(0)
+            pointer = pointer[k_part]
+            for i in k_part_splits[:-1]:
+                assert i[-1] == ']'
+                pointer = pointer[int(i[:-1])]
+            assert k_part_splits[-1][-1] == ']'
+            ori_value = pointer[int(k_part_splits[-1][:-1])]
+            pointer[int(k_part_splits[-1][:-1])] = v
+        else:
+            ori_value = pointer.get(k_parts[-1])
+            pointer[k_parts[-1]] = v
+        if ori_value:
+            logger.warning(f"Overrided {k} from {ori_value} to {v}")
+        elif log_new:
+            logger.warning(f"Added {k}: {v}")
+
+
+def load_opt_from_config_files(conf_files):
+    """
+    Load opt from the config files; settings in later files can override those in previous files.
+
+    Args:
+        conf_files (list): a list of config file paths
+
+    Returns:
+        dict: a dictionary of opt settings
+    """
+    opt = {}
+    for conf_file in conf_files:
+        with open(conf_file, encoding='utf-8') as f:
+            # config_dict = yaml.safe_load(f)
+            config_dict = yaml.unsafe_load(f)
+
+        load_config_dict_to_opt(opt, config_dict)
+
+    return opt
+
+
+def load_opt_command(args):
+    parser = argparse.ArgumentParser(description='MainzTrain: Pretrain or fine-tune models for NLP tasks.')
+    parser.add_argument('command', help='Command: train/evaluate/train-and-evaluate')
+    parser.add_argument('--conf_files', nargs='+', required=True, help='Path(s) to the MainzTrain config file(s).')
+    parser.add_argument('--user_dir', help='Path to the user defined module for tasks (models, criteria), optimizers, and lr schedulers.')
+    parser.add_argument('--config_overrides', nargs='*', help='Override parameters on config with a json style string, e.g. {"<PARAM_NAME_1>": <PARAM_VALUE_1>, "<PARAM_GROUP_2>.<PARAM_SUBGROUP_2>.<PARAM_2>": <PARAM_VALUE_2>}. A key with "." updates the object in the corresponding nested dict. Remember to escape " in command line.')
+
+    cmdline_args = parser.parse_args() if not args else parser.parse_args(args)
+
+    add_env_parser_to_yaml()
+    opt = load_opt_from_config_files(cmdline_args.conf_files)
+
+    if cmdline_args.config_overrides:
+        config_overrides_string = ' '.join(cmdline_args.config_overrides)
+        config_overrides_string = os.path.expandvars(config_overrides_string)
+        logger.warning(f"Command line config overrides: {config_overrides_string}")
+        config_dict = yaml.safe_load(config_overrides_string)
+        load_config_dict_to_opt(opt, config_dict)
+
+    # combine cmdline_args into opt dictionary
+    for key, val in cmdline_args.__dict__.items():
+        if val is not None:
+            opt[key] = val
+
+    return opt, cmdline_args
+
+
+def save_opt_to_json(opt, conf_file):
+    with open(conf_file, 'w', encoding='utf-8') as f:
+        json.dump(opt, f, indent=4)
+
+
+def save_opt_to_yaml(opt, conf_file):
+    with open(conf_file, 'w', encoding='utf-8') as f:
+        yaml.dump(opt, f)
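`load_config_dict_to_opt` accepts dotted key paths with optional list indices (the "a.b[0][1].c" format noted in its code). A minimal sketch of the override semantics (illustrative, not one of the uploaded files; the config keys are made up):

```python
# Hedged sketch of the override key syntax: dotted paths descend into nested
# dicts, and "[i]" suffixes index into lists.
from MedImageInsight.Utils.Arguments import load_config_dict_to_opt

opt = {
    'LANG_ENCODER': {'WIDTH': 512},
    'SOLVER': {'STEPS': [30000, 60000]},
}

load_config_dict_to_opt(opt, {
    'LANG_ENCODER.WIDTH': 768,    # nested dict override
    'SOLVER.STEPS[1]': 90000,     # list-element override via "[index]"
    'SOLVER.WARMUP': 500,         # new key, added to the nested dict
})

print(opt['LANG_ENCODER']['WIDTH'])  # 768
print(opt['SOLVER']['STEPS'])        # [30000, 90000]
```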
MedImageInsight/Utils/GeneraUtils.py
ADDED
@@ -0,0 +1,263 @@
+import math
+import logging
+import copy
+import itertools
+import random
+from collections.abc import Iterable, Iterator
+import torch
+from torch._C import default_generator
+import torch.distributed as dist
+import time
+from functools import wraps, partial
+
+logger = logging.getLogger(__name__)
+
+
+class ObjectView(object):
+    def __init__(self, d):
+        self.__dict__ = d
+
+
+class AverageMeter(object):
+    """Computes and stores the average and current value."""
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1, decay=0):
+        self.val = val
+        if decay:
+            alpha = math.exp(-n / decay)  # exponential decay over 100 updates
+            self.sum = alpha * self.sum + (1 - alpha) * val * n
+            self.count = alpha * self.count + (1 - alpha) * n
+        else:
+            self.sum += val * n
+            self.count += n
+        self.avg = self.sum / self.count
+
+    def getstate(self):
+        return {'val': self.val,
+                'avg': self.avg,
+                'sum': self.sum,
+                'count': self.count}
+
+    def setstate(self, state):
+        self.val = state['val']
+        self.avg = state['avg']
+        self.sum = state['sum']
+        self.count = state['count']
+
+
+def move_batch_to_device(batch, device):
+    """
+    Move the batch to the device.
+    It should be called before feeding the batch to the model.
+
+    Args:
+        batch (torch.tensor or container of torch.tensor): input batch
+        device (torch.device): device to move the batch to
+    Returns:
+        return_batch: same type as the input batch with internal tensors moved to device
+    """
+    if torch.is_tensor(batch):
+        return_batch = batch.to(device)
+    elif isinstance(batch, list):
+        return_batch = [move_batch_to_device(t, device) for t in batch]
+    elif isinstance(batch, tuple):
+        return_batch = tuple(move_batch_to_device(t, device) for t in batch)
+    elif isinstance(batch, dict):
+        return_batch = {}
+        for k in batch:
+            return_batch[k] = move_batch_to_device(batch[k], device)
+    else:
+        logger.debug(f"Can not move type {type(batch)} to device. Skipping it in the batch.")
+        return_batch = batch
+
+    return return_batch
+
+
+def cast_batch_to_dtype(batch, dtype):
+    """
+    Cast the float32 tensors in a batch to a specified torch dtype.
+    It should be called before feeding the batch to the FP16 DeepSpeed model.
+
+    Args:
+        batch (torch.tensor or container of torch.tensor): input batch
+    Returns:
+        return_batch: same type as the input batch with internal float32 tensors cast to the specified dtype.
+    """
+    if torch.is_tensor(batch):
+        if torch.is_floating_point(batch):
+            return_batch = batch.to(dtype)
+        else:
+            return_batch = batch
+    elif isinstance(batch, list):
+        return_batch = [cast_batch_to_dtype(t, dtype) for t in batch]
+    elif isinstance(batch, tuple):
+        return_batch = tuple(cast_batch_to_dtype(t, dtype) for t in batch)
+    elif isinstance(batch, dict):
+        return_batch = {}
+        for k in batch:
+            return_batch[k] = cast_batch_to_dtype(batch[k], dtype)
+    else:
+        logger.debug(f"Can not cast type {type(batch)} to {dtype}. Skipping it in the batch.")
+        return_batch = batch
+
+    return return_batch
+
+
+def cast_batch_to_half(batch):
+    """
+    Cast the float32 tensors in a batch to float16.
+    It should be called before feeding the batch to the FP16 DeepSpeed model.
+
+    Args:
+        batch (torch.tensor or container of torch.tensor): input batch
+    Returns:
+        return_batch: same type as the input batch with internal float32 tensors cast to float16
+    """
+    return cast_batch_to_dtype(batch, torch.float16)
+
+
+def cast_batch_to_bf16(batch):
+    """
+    Cast the float32 tensors in a batch to bfloat16.
+    It should be called before feeding the batch to the FP16 DeepSpeed model.
+
+    Args:
+        batch (torch.tensor or container of torch.tensor): input batch
+    Returns:
+        return_batch: same type as the input batch with internal float32 tensors cast to bfloat16
+    """
+    return cast_batch_to_dtype(batch, torch.bfloat16)
+
+
+# copied from MainzSpeech/moe_tools
+def peek_first_item_from_iterator(it):
+    # extract first item from iterator
+    first_item = next(it)
+    # create iterator with the first item added back in
+    new_it = itertools.chain([copy.deepcopy(first_item)], it)
+    return first_item, new_it
+
+
+# copied from MainzSpeech/moe_tools
+def generate_dummy_batch(it):
+    """
+    Generates a dummy batch by peeking at the given iterable or iterator on rank 0,
+    then broadcasts dummy_batch to all other ranks.
+    """
+    from mpi4py import MPI
+    assert isinstance(it, Iterable) or isinstance(it, Iterator)
+    if isinstance(it, Iterable):
+        it = iter(it)
+    if MPI.COMM_WORLD.Get_rank() == 0:
+        dummy_batch, it = peek_first_item_from_iterator(it)
+    else:
+        dummy_batch = None
+    dummy_batch = MPI.COMM_WORLD.bcast(dummy_batch, root=0)
+    assert dummy_batch is not None
+    return dummy_batch, it
+
+
+def retry_on_failure(func=None, *, max_retries=3, on_error_func=None, on_retry_func=None, raise_err_func=None, sleep_time=30, error_types=(Exception,)):
+    """
+    Decorator utility to retry a function. This decorator must be used either without arguments (@retry_on_failure) or with all named arguments (@retry_on_failure(max_retries=10)).
+    Args:
+        max_retries (int): The number of retries to perform, in addition to the initial attempt. Defaults to 3.
+        sleep_time (int): The time in seconds to wait before the next retry. Defaults to 30.
+        error_types (Tuple[type]): a tuple of exception types used to catch any error being retried; if the exception that is thrown is not an instance of one of these types, the function is not retried. Defaults to (Exception,), which covers all exceptions.
+        on_retry_func (callable(num_retries)): A function with a single argument, the number of retries done so far. This function is called just before any retry. Defaults to a function logging `num_retries`.
+        on_error_func (callable(num_retries)): A function with a single argument, the number of retries done in total. This function is called after `max_retries` has been exceeded. Defaults to a function logging `num_retries`.
+        raise_err_func (callable(err)): A function with a single argument, the exception that was thrown. This function is called after `max_retries` has been exceeded. Defaults to raising the error.
+    """
+    if on_error_func is None:
+        def on_error_func(retried_times):
+            logger.warning(f"Failed after retrying {retried_times} times")
+
+    if on_retry_func is None:
+        def on_retry_func(idx):
+            logger.warning(f"Retrying on failure {idx}")
+
+    if raise_err_func is None:
+        def raise_err_func(err):
+            raise err
+
+    if func is None:
+        return partial(
+            retry_on_failure,
+            max_retries=max_retries,
+            on_error_func=on_error_func,
+            on_retry_func=on_retry_func,
+            raise_err_func=raise_err_func,
+            sleep_time=sleep_time,
+            error_types=error_types,
+        )
+
+    @wraps(func)
+    def decorator(*args, **kwargs):
+        num_retries = 0
+        while True:
+            try:
+                return func(*args, **kwargs)
+            except error_types as err:
+                num_retries += 1
+                on_retry_func(num_retries)
+                if num_retries > max_retries:
+                    on_error_func(num_retries)
+                    raise_err_func(err)
+                time.sleep(sleep_time)
+
+    return decorator
+
+
+class TemporaryRngState:
+    '''
+    Context manager for working with a temporary random number generator (RNG) state.
+    The constructor gets a random number from the Python RNG that is used as
+    (part of) the seed for the temporary RNG
+    and then stores the current RNG state to restore it later on.
+    If add_rank_to_seed=True, the GPU rank is added to the seed.
+    This is useful to initialize MoE models
+    where the experts on different GPUs should be initialized independently.
+    Note that this feature requires torch.distributed to be initialized.
+    On enter, the context manager sets the RNG state to the random seed created in the constructor
+    to establish a temporary RNG state.
+    On exit, the context manager resets the RNG state to the previously remembered state.
+    Thereby, any RNG operations executed with this context manager
+    do not affect the global, non-temporary RNG state.
+    However, the usage of this context manager does advance the Python RNG
+    since it uses that RNG to generate the random seed in the constructor.
+    The context manager resets the Python RNG state and
+    the PyTorch RNG state for CPU and GPU (if cuda is initialized).
+    It does not currently reset the numpy RNG state.
+    '''
+    def __init__(self, add_rank_to_seed=False):
+        self.seed = random.randrange(2**32)
+        if add_rank_to_seed and dist.is_initialized():
+            self.seed += dist.get_rank()
+        self.python_rng_state = random.getstate()
+        self.torch_rng_state = torch.get_rng_state()
+        if torch.cuda.is_initialized():
+            self.torch_rng_state_cuda = torch.cuda.get_rng_state()
+
+    def __enter__(self):
+        # increment seed for different RNGs to avoid correlation
+        # in the (very unlikely) case that the different RNGs
+        # use the exact same algorithm
+        random.seed(self.seed)
+        # do not call torch.manual_seed here, because that sets the seed of all GPUs
+        default_generator.manual_seed(self.seed + 1)
+        if torch.cuda.is_initialized():
+            torch.cuda.manual_seed(self.seed + 2)  # only set seed of default cuda device
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        random.setstate(self.python_rng_state)
+        torch.set_rng_state(self.torch_rng_state)
+        if torch.cuda.is_initialized():
+            torch.cuda.set_rng_state(self.torch_rng_state_cuda)
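A minimal usage sketch for `retry_on_failure` (illustrative, not one of the uploaded files; `flaky_download` and its failure mode are invented):

```python
# Hedged sketch: wrapping a flaky call so transient errors are retried.
from MedImageInsight.Utils.GeneraUtils import retry_on_failure

@retry_on_failure(max_retries=2, sleep_time=1, error_types=(IOError,))
def flaky_download(url):
    # pretend this sometimes raises IOError on transient network errors
    raise IOError(f"could not fetch {url}")

try:
    flaky_download("https://example.com/weights.pt")
except IOError:
    # after the initial attempt plus 2 retries, the last error is re-raised
    print("gave up after retries")
```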
MedImageInsight/Utils/GlobalExceptHook.py
ADDED
@@ -0,0 +1,61 @@
+import sys
+import logging
+
+logger = logging.getLogger(__name__)
+
+_orig_except_hook = None
+
+
+def _global_except_hook(exctype, value, traceback):
+    """Catches an unhandled exception and calls MPI_Abort()."""
+    try:
+        if _orig_except_hook:
+            _orig_except_hook(exctype, value, traceback)
+        else:
+            sys.__excepthook__(exctype, value, traceback)
+
+    finally:
+        import mpi4py.MPI
+        rank = mpi4py.MPI.COMM_WORLD.Get_rank()
+        logger.warning("******************************************")
+        logger.warning("MainzTrainer:")
+        logger.warning(f" Uncaught exception on rank {rank}.")
+        logger.warning(" Calling MPI_Abort() to shut down MPI...")
+        logger.warning("******************************************")
+        logging.shutdown()
+
+        try:
+            import mpi4py.MPI
+            mpi4py.MPI.COMM_WORLD.Abort(1)
+        except Exception as e:
+            # Something is completely broken...
+            # There's nothing we can do any more
+            sys.stderr.write("Sorry, failed to stop MPI and the process may hang.\n")
+            sys.stderr.flush()
+            raise e
+
+
+def add_hook():
+    """
+    Add a global hook function that captures all unhandled exceptions.
+    The function calls MPI_Abort() to force all processes to abort.
+
+    An MPI runtime is expected to kill all of its child processes
+    if one of them exits abnormally or without calling `MPI_Finalize()`.
+    However, when a Python program runs on `mpi4py`, the MPI runtime
+    often fails to detect a process failure, and the rest of the processes
+    hang infinitely.
+
+    See https://github.com/chainer/chainermn/issues/236 and
+    https://mpi4py.readthedocs.io/en/stable/mpi4py.run.html for more
+    information.
+    """
+    global _orig_except_hook
+
+    if _orig_except_hook is not None:
+        logger.warning("GlobalExceptHook.add_hook() seems to be called multiple times. Ignoring.")
+        return
+
+    logger.info("Adding global except hook for the distributed job to shut down MPI if an unhandled exception is raised on some of the ranks.")
+    _orig_except_hook = sys.excepthook
+    sys.excepthook = _global_except_hook
MedImageInsight/Utils/MPIAdapter.py
ADDED
@@ -0,0 +1,147 @@
+import logging
+from mpi4py import MPI
+import os
+import re
+import subprocess
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+class MPIAdapter:
+    """
+    MPIAdapter automatically detects and analyzes the training environment for distributed training
+    and offers methods to set up distributed training jobs.
+
+    For example, it determines whether training happens on AML, Philly, or locally.
+    It also determines variables such as the world size and the rank of each GPU.
+    """
+
+    def __init__(self, set_env_vars=True, master_address=None, port='55551'):
+        local_address = '127.0.0.1'
+        default_torch_distributed_port = str(port)  # chosen arbitrarily
+
+        if 'OMPI_COMM_WORLD_SIZE' not in os.environ:
+            # application was started without MPI
+            # default to single node with single process
+            self.env_info = 'no MPI'
+            self.world_size = 1
+            self.local_size = 1
+            self.rank = 0
+            self.local_rank = 0
+            self.master_address = local_address
+            self.master_port = default_torch_distributed_port
+        else:
+            # application was started with MPI
+            # get MPI parameters
+            self.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
+            self.local_size = int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
+            self.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
+            self.local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
+
+            if master_address is not None:
+                self.master_address = master_address
+                self.master_port = default_torch_distributed_port
+                self.env_info = 'manually set master ip'
+            elif 'PHILLY_CONTAINER_IP' in os.environ:
+                # application is running on Philly
+                # read environment variables on master node and broadcast via MPI
+                self.env_info = 'philly'
+                if self.rank == 0:
+                    self.master_address = os.environ['PHILLY_CONTAINER_IP']
+                    self.master_port = os.environ['PHILLY_CONTAINER_PORT_RANGE_START']
+                else:
+                    self.master_address = None
+                    self.master_port = None
+                self.master_address = MPI.COMM_WORLD.bcast(self.master_address, root=0)
+                self.master_port = MPI.COMM_WORLD.bcast(self.master_port, root=0)
+            elif "AMLK8S_NUM_WORKER" in os.environ or "AZ_CMK8S_JOB_WORK_DIR" in os.environ:
+                # application is running on AMLK8S (ITP)
+                # read master address from a specific file.
+                self.env_info = 'AMLK8S (ITP)'
+                # from: https://k8s-wiki.azureml.com/faq.html
+                regexp = r"[\s\S]*export[\s]*DLTS_SD_worker0_IP=([0-9.]+)[\s|s]*"
+                with open("/dlts-runtime/env/init.env", 'r') as f:
+                    line = f.read()
+                match = re.match(regexp, line)
+                if match:
+                    self.master_address = str(match.group(1))
+                else:
+                    # Did not find master node ip in file. It must be a single-node
+                    # debugging job with custom "mpirun" command
+                    assert self.world_size == self.local_size, \
+                        "It's not a single-node debugging job on AMLK8S (ITP), but no master ip is found in file."
+                    self.env_info = 'single-node AMLK8S (ITP) debugging job'
+                    self.master_address = local_address
+                self.master_port = default_torch_distributed_port
+            elif 'AZ_BATCH_MASTER_NODE' in os.environ:
+                # application is running on multiple nodes on AML
+                self.env_info = 'multi-node AML'
+                master_node_params = os.environ['AZ_BATCH_MASTER_NODE'].split(':')
+                self.master_address = master_node_params[0]
+                self.master_port = default_torch_distributed_port
+            elif self.world_size == self.local_size:
+                # application is running with MPI on single node
+                self.env_info = 'single-node AML or other MPI environment'
+                self.master_address = local_address
+                self.master_port = default_torch_distributed_port
+            else:
+                # multi-node MPI environment, but not Philly or AML
+                # we use "hostname -I" command on rank 0 to get the master address
+                self.env_info = 'multi-node other MPI environment'
+                if self.rank == 0:
+                    hostname_cmd = ["hostname -I"]
+                    result = subprocess.check_output(hostname_cmd, shell=True)
+                    self.master_address = result.decode('utf-8').split()[0]
+                    self.master_port = default_torch_distributed_port
+                else:
+                    self.master_address = None
+                    self.master_port = None
+                self.master_address = MPI.COMM_WORLD.bcast(self.master_address, root=0)
+                self.master_port = MPI.COMM_WORLD.bcast(self.master_port, root=0)
+
+        self.init_method_url = f'tcp://{self.master_address}:{self.master_port}'
+        if set_env_vars:
+            self._set_env_vars()
+
+    def log_info(self):
+        """
+        Logs information about the distributed training environment.
+        """
+        # use logger.warning because MainzTrain has a hidden convention
+        # of not printing logger.info messages on processes with rank > 0
+        logger.warning('----------------')
+        logger.warning('MPI Adapter data')
+        logger.warning('----------------')
+        logger.warning(f'environment info: {self.env_info}')
+        logger.warning(f'init method url: {self.init_method_url}')
+        logger.warning(f'world size: {self.world_size}')
+        logger.warning(f'local size: {self.local_size}')
+        logger.warning(f'rank: {self.rank}')
+        logger.warning(f'local rank: {self.local_rank}')
+        logger.warning(f'master address: {self.master_address}')
+        logger.warning(f'master port: {self.master_port}')
+        logger.warning('----------------')
+
+    def init_process_group(self, backend):
+        """
+        Initializes the default PyTorch distributed process group.
+        """
+        # use logger.warning because MainzTrain has a hidden convention
+        # of not printing logger.info messages on processes with rank > 0
+        logger.warning('trying to initialize process group ...')
+        torch.distributed.init_process_group(backend=backend,
+                                             init_method=self.init_method_url,
+                                             world_size=self.world_size,
+                                             rank=self.rank)
+        logger.warning('process group initialized')
+
+    def _set_env_vars(self):
+        """
+        Sets environment variables for world size, rank, local rank, master addr, and master port.
+        """
+        os.environ['WORLD_SIZE'] = str(self.world_size)
+        os.environ['RANK'] = str(self.rank)
+        os.environ["LOCAL_RANK"] = str(self.local_rank)
+        os.environ['MASTER_ADDR'] = self.master_address
+        os.environ['MASTER_PORT'] = self.master_port
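A hedged sketch of the intended launch-time call sequence (illustrative, not one of the uploaded files); outside MPI the adapter falls back to a single local process:

```python
# Typical wiring when launching under mpirun; also works as a single process.
import torch
from MedImageInsight.Utils.MPIAdapter import MPIAdapter

adapter = MPIAdapter()   # also exports WORLD_SIZE / RANK / LOCAL_RANK / MASTER_ADDR / MASTER_PORT
adapter.log_info()

backend = 'nccl' if torch.cuda.is_available() else 'gloo'
adapter.init_process_group(backend=backend)

# per-process device selection typically uses the local rank
if torch.cuda.is_available():
    torch.cuda.set_device(adapter.local_rank)
```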
MedImageInsight/Utils/Utils.py
ADDED
@@ -0,0 +1,141 @@
+import logging
+import os
+import torch
+import torch.distributed as dist
+import yaml
+
+from fvcore.nn import FlopCountAnalysis
+from fvcore.nn import flop_count_table
+from fvcore.nn import flop_count_str
+
+
+logger = logging.getLogger(__name__)
+
+
+NORM_MODULES = [
+    torch.nn.BatchNorm1d,
+    torch.nn.BatchNorm2d,
+    torch.nn.BatchNorm3d,
+    torch.nn.SyncBatchNorm,
+    # NaiveSyncBatchNorm inherits from BatchNorm2d
+    torch.nn.GroupNorm,
+    torch.nn.InstanceNorm1d,
+    torch.nn.InstanceNorm2d,
+    torch.nn.InstanceNorm3d,
+    torch.nn.LayerNorm,
+    torch.nn.LocalResponseNorm,
+]
+
+
+def register_norm_module(cls):
+    NORM_MODULES.append(cls)
+
+    return cls
+
+
+def is_main_process():
+    rank = 0
+    if 'OMPI_COMM_WORLD_SIZE' in os.environ:
+        rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
+
+    return rank == 0
+
+
+@torch.no_grad()
+def analysis_model(model, dump_input, verbose=False):
+    model.eval()
+    flops = FlopCountAnalysis(model, dump_input)
+    total = flops.total()
+    model.train()
+    params_total = sum(p.numel() for p in model.parameters())
+    params_learned = sum(
+        p.numel() for p in model.parameters() if p.requires_grad
+    )
+    logger.info(f"flop count table:\n {flop_count_table(flops)}")
+    if verbose:
+        logger.info(f"flop count str:\n {flop_count_str(flops)}")
+    logger.info(f" Total flops: {total/1000/1000:.3f}M,")
+    logger.info(f" Total params: {params_total/1000/1000:.3f}M,")
+    logger.info(f" Learned params: {params_learned/1000/1000:.3f}M")
+
+    return total, flop_count_table(flops), flop_count_str(flops)
+
+
+def load_config_dict_to_opt(opt, config_dict, splitter='.'):
+    """
+    Load the key, value pairs from config_dict to opt, overriding existing values in opt
+    if there are any.
+    """
+    if not isinstance(config_dict, dict):
+        raise TypeError("Config must be a Python dictionary")
+    for k, v in config_dict.items():
+        k_parts = k.split(splitter)
+        pointer = opt
+        for k_part in k_parts[:-1]:
+            if k_part not in pointer:
+                pointer[k_part] = {}
+            pointer = pointer[k_part]
+            assert isinstance(pointer, dict), "Overriding key needs to be inside a Python dict."
+        ori_value = pointer.get(k_parts[-1])
+        pointer[k_parts[-1]] = v
+        if ori_value:
+            print(f"Overrided {k} from {ori_value} to {pointer[k_parts[-1]]}")
+
+
+def load_opt_from_config_file(conf_file):
+    """
+    Load opt from the config file.
+
+    Args:
+        conf_file: config file path
+
+    Returns:
+        dict: a dictionary of opt settings
+    """
+    opt = {}
+    with open(conf_file, encoding='utf-8') as f:
+        config_dict = yaml.safe_load(f)
+        load_config_dict_to_opt(opt, config_dict)
+
+    return opt
+
+
+def cast_batch_to_dtype(batch, dtype):
+    """
+    Cast the float32 tensors in a batch to a specified torch dtype.
+    It should be called before feeding the batch to the FP16 DeepSpeed model.
+
+    Args:
+        batch (torch.tensor or container of torch.tensor): input batch
+    Returns:
+        return_batch: same type as the input batch with internal float32 tensors cast to the specified dtype.
+    """
+    if torch.is_tensor(batch):
+        if torch.is_floating_point(batch):
+            return_batch = batch.to(dtype)
+        else:
+            return_batch = batch
+    elif isinstance(batch, list):
+        return_batch = [cast_batch_to_dtype(t, dtype) for t in batch]
+    elif isinstance(batch, tuple):
+        return_batch = tuple(cast_batch_to_dtype(t, dtype) for t in batch)
+    elif isinstance(batch, dict):
+        return_batch = {}
+        for k in batch:
+            return_batch[k] = cast_batch_to_dtype(batch[k], dtype)
+    else:
+        logger.debug(f"Can not cast type {type(batch)} to {dtype}. Skipping it in the batch.")
+        return_batch = batch
+
+    return return_batch
+
+
+def cast_batch_to_half(batch):
+    """
+    Cast the float32 tensors in a batch to float16.
+    It should be called before feeding the batch to the FP16 DeepSpeed model.
+
+    Args:
+        batch (torch.tensor or container of torch.tensor): input batch
+    Returns:
+        return_batch: same type as the input batch with internal float32 tensors cast to float16
+    """
+    return cast_batch_to_dtype(batch, torch.float16)
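A short sketch of `analysis_model` on a toy module (illustrative, not one of the uploaded files; the layer sizes are arbitrary):

```python
# Hedged sketch: counting FLOPs and parameters of a small model with fvcore.
import torch
from MedImageInsight.Utils.Utils import analysis_model

model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 8, kernel_size=3, padding=1),
    torch.nn.ReLU(),
    torch.nn.Flatten(),
    torch.nn.Linear(8 * 32 * 32, 10),
)
dump_input = torch.randn(1, 3, 32, 32)

total_flops, table, _ = analysis_model(model, dump_input, verbose=False)
print(f"{total_flops / 1e6:.2f} MFLOPs")
```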
MedImageInsight/Utils/__init__.py
ADDED
@@ -0,0 +1,7 @@
+from .Utils import analysis_model
+from .Utils import is_main_process
+from .Utils import register_norm_module
+from .Utils import NORM_MODULES
+from .Utils import load_config_dict_to_opt
+from .Utils import load_opt_from_config_file
+from .Utils import cast_batch_to_half
MedImageInsight/__init__.py
ADDED
@@ -0,0 +1,9 @@
+from .UniCLModel import build_unicl_model
+
+__all__ = [
+    'build_od_model',
+    'build_unicl_model',
+    'build_tokenizer_from_name',
+    'get_image_preprocess',
+    'build_unicl_matching_model'
+]