from transformers import AutoFeatureExtractor, AutoModel
import torch
from torchvision.transforms.functional import to_pil_image
from einops import rearrange, reduce
from skops import hub_utils
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
import os
import glob
import pickle

setups = ['ResNet-50', 'ViT', 'DINO-ResNet-50', 'DINO-ViT']
embedder_names = [
    'microsoft/resnet-50',
    'google/vit-base-patch16-224',
    'Ramos-Ramos/dino-resnet-50',
    'facebook/dino-vitb16'
]
gam_names = ['emb-gam-resnet', 'emb-gam-vit', 'emb-gam-dino-resnet', 'emb-gam-dino']

embedder_to_setup = dict(zip(embedder_names, setups))
gam_to_setup = dict(zip(gam_names, setups))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load embedding models and their patch-embedding post-processing functions
embedders = {}
for name in embedder_names:
    embedder = {}
    embedder['feature_extractor'] = AutoFeatureExtractor.from_pretrained(name)
    embedder['model'] = AutoModel.from_pretrained(name).eval().to(device)
    if 'resnet-50' in name:
        # ResNet-50 outputs a 7x7 feature map; flatten it into 49 patch embeddings
        embedder['num_patches_side'] = 7
        embedder['embedding_postprocess'] = lambda x: rearrange(
            x.last_hidden_state, 'b d h w -> b (h w) d'
        )
    else:
        # ViT outputs one token per patch; drop the leading [CLS] token
        embedder['num_patches_side'] = (
            embedder['model'].config.image_size // embedder['model'].config.patch_size
        )
        embedder['embedding_postprocess'] = lambda x: x.last_hidden_state[:, 1:]
    embedders[embedder_to_setup[name]] = embedder

# download the fitted GAMs from the Hugging Face Hub and unpickle them
gams = {}
for name in gam_names:
    if not os.path.exists(name):
        os.mkdir(name)
        hub_utils.download(repo_id=f'Ramos-Ramos/{name}', dst=name)
    with open(f'{name}/model.pkl', 'rb') as infile:
        gams[gam_to_setup[name]] = pickle.load(infile)

# Imagenette class labels
labels = [
    'tench',
    'English springer',
    'cassette player',
    'chain saw',
    'church',
    'French horn',
    'garbage truck',
    'gas pump',
    'golf ball',
    'parachute'
]


def visualize(input_img, visual_emb_gam_setups, show_scores, show_cbars):
    '''Visualizes the patch contributions to all labels of one or more visual Emb-GAMs'''
    if not visual_emb_gam_setups:
        fig = plt.Figure()
        return fig

    # get patch contributions per Emb-GAM
    patch_contributions = {}
    for setup in visual_emb_gam_setups:
        # prepare embedding model
        embedder_setup = embedders[setup]
        feature_extractor = embedder_setup['feature_extractor']
        embedding_postprocess = embedder_setup['embedding_postprocess']
        num_patches_side = embedder_setup['num_patches_side']

        # prepare GAM
        gam = gams[setup]

        # get patch embeddings
        inputs = {
            k: v.to(device)
            for k, v in feature_extractor(input_img, return_tensors='pt').items()
        }
        with torch.no_grad():
            patch_embeddings = embedding_postprocess(
                embedder_setup['model'](**inputs)
            ).cpu()[0]

        # compute per-patch contributions to each class logit, distributing the
        # intercept evenly across patches
        patch_contributions[setup] = (
            gam.coef_ @ patch_embeddings.T.numpy()
            + gam.intercept_.reshape(-1, 1) / (num_patches_side ** 2)
        ).reshape(-1, num_patches_side, num_patches_side)

    # plot heatmaps
    multiple_setups = len(visual_emb_gam_setups) > 1

    # set up figure: one row per setup, one column for the input image plus one
    # column per class
    fig, axs = plt.subplots(
        len(visual_emb_gam_setups),
        11,
        figsize=(20, round(10 / 4 * len(visual_emb_gam_setups)))
    )

    # merge the first column into a single axis for the original image
    gs_ax = axs[0, 0] if multiple_setups else axs[0]
    gs = gs_ax.get_gridspec()
    ax_rm = axs[:, 0] if multiple_setups else [axs[0]]
    for ax in ax_rm:
        ax.remove()
    ax_orig_img = fig.add_subplot(gs[:, 0] if multiple_setups else gs[0])

    # plot original image
    ax_orig_img.imshow(input_img)
    ax_orig_img.axis('off')

    # plot patch contributions
    axs_maps = axs[:, 1:] if multiple_setups else [axs[1:]]
    for i, setup in enumerate(visual_emb_gam_setups):
        vmin = patch_contributions[setup].min()
        vmax = patch_contributions[setup].max()
        for j in range(10):
            ax = axs_maps[i][j]
            sns.heatmap(
                patch_contributions[setup][j],
                ax=ax,
                square=True,
                vmin=vmin,
                vmax=vmax,
                cbar=show_cbars
            )
            if show_scores:
                ax.set_xlabel(f'{patch_contributions[setup][j].sum():.2f}')
            if j == 0:
                ax.set_ylabel(setup)
            if i == 0:
                ax.set_title(labels[j])
            ax.set_xticks([])
            ax.set_yticks([])

    plt.tight_layout()
    return fig
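# Sanity check (a hypothetical sketch, not executed by the app): since the GAM
# is linear in the summed embedding, the per-patch contributions for each class
# should add up to the logit of the pooled embedding:
#
#   logit_c = w_c . (sum_p e_p) + b_c = sum_p (w_c . e_p + b_c / P),
#
# where P = num_patches_side ** 2 is the number of patches. Inside the loop in
# `visualize`, this could be verified with:
#
#   import numpy as np
#   logits = gam.coef_ @ patch_embeddings.numpy().sum(0) + gam.intercept_
#   assert np.allclose(patch_contributions[setup].sum(axis=(1, 2)), logits)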
description = 'Visualize the patch contributions of [visual Emb-GAMs](https://huggingface.co/models?other=visual%20emb-gam) to class labels.'

article = '''An extension of [Emb-GAMs](https://arxiv.org/abs/2209.11799), visual Emb-GAMs classify images by embedding them, taking the intermediate representations corresponding to different spatial regions, summing these up, and predicting a class label from the sum with a GAM. Because the prediction is made from a sum of embeddings, we can visualize which regions of an image contributed positively or negatively to each class score.

No paper yet, but you can refer to these tweets:

- [Tweet #1](https://twitter.com/patrick_j_ramos/status/1586992857969147904?s=20&t=5-j5gKK0FpZOgzR_9Wdm1g)
- [Tweet #2](https://twitter.com/patrick_j_ramos/status/1602187142062804992?s=20&t=roTFXfMkHHYVoCuNyN-AUA)

Also, check out the original [Emb-GAM paper](https://arxiv.org/abs/2209.11799).

```bibtex
@article{singh2022emb,
  title={Emb-GAM: an Interpretable and Efficient Predictor using Pre-trained Language Models},
  author={Singh, Chandan and Gao, Jianfeng},
  journal={arXiv preprint arXiv:2209.11799},
  year={2022}
}
```
'''

demo = gr.Interface(
    fn=visualize,
    inputs=[
        gr.Image(shape=(224, 224), type='pil', label='Input image'),
        gr.CheckboxGroup(setups, value=setups, label='Visual Emb-GAM'),
        gr.Checkbox(label='Show scores'),
        gr.Checkbox(label='Show color bars')
    ],
    outputs=[
        gr.Plot(label='Patch contributions')
    ],
    examples=[[path, setups, False, False] for path in glob.glob('examples/*')],
    title='Visual Emb-GAM Probing',
    description=description,
    article=article,
    examples_per_page=20
)

demo.launch(debug=True)
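# Example of calling `visualize` outside the Gradio UI (a sketch; the path
# 'examples/tench.jpg' is hypothetical, and any RGB image works):
#
#   from PIL import Image
#   img = Image.open('examples/tench.jpg').convert('RGB').resize((224, 224))
#   fig = visualize(img, ['ViT', 'DINO-ViT'], show_scores=True, show_cbars=True)
#   fig.savefig('patch_contributions.png', bbox_inches='tight')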