"""Gradio demo: indoor scene recognition with a fine-tuned image classifier.

Loads the ``vincentclaes/mit-indoor-scenes`` feature extractor and model,
exposes ``classify`` as the inference function, and launches a Gradio
interface that shows the five most probable scene classes for an upload.
"""
import torch
from transformers import AutoFeatureExtractor, AutoModelForImageClassification
from einops import rearrange
import gradio

import call_labels

# Hub identifier shared by the feature extractor and the model weights.
MODEL_NAME = "vincentclaes/mit-indoor-scenes"

# define the feature extractor
extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)

# define the pretrained model
model = AutoModelForImageClassification.from_pretrained(MODEL_NAME)

# retrieve the labels provided from MIT Indoor Scenes dataset
# (https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019)
# NOTE(review): `classify` looks labels up by str(index), so this is presumably
# a mapping keyed by '0', '1', ... -- confirm against call_labels.
labels = call_labels.call_labels()

# call model.eval() to assert that we are evaluating the model and not
# updating the weights
model.eval()

# Text shown under the title. The line break after "image. " is part of the
# string; it is written as an escape so the literal stays on valid lines.
DESCRIPTION = (
    'A smart and easy-to-use indoor scene classifier. '
    'Start by uploading an input image. \n'
    'The outputs are the top five indoor scene classes that best fit your input image.'
)

# Text shown below the interface.
ARTICLE = '''

Additional Information

This indoor scene classifier employs the google/vit-base-patch16-224-in21k, a Visual Transformer (ViT) model pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224 and was first introduced in the paper An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale by Dosovitskiy et al. The original GitHub repository of the Visual Transformer is found in this link. This Visual Transformer model was fine-tuned on the MIT Indoor Scenes from Kaggle. The source model from Hugging Face is found in this link.

'''


# define the function used for model inference
def classify(image):
    """Return the class probabilities predicted for ``image``.

    Args:
        image: The image handed over by the Gradio input component, which is
            configured below with ``type='pil'`` and ``shape=(224, 224)``.

    Returns:
        A dict mapping ``labels[str(i)]`` to the softmax probability (as a
        Python float) of class index ``i``, for every ``i`` in
        ``range(len(labels))``.
    """
    # disable gradient calculation
    with torch.no_grad():
        # extract features from the image input
        inputs = extractor(images=image, return_tensors='pt')
        # keep only the logits of the model output
        logits = model(**inputs).logits
        # remove the batch dimension (the pattern requires a batch of one)
        logits = rearrange(logits, '1 j->j')
        # Convert the logits into probabilities. The tensor is 1-D here, so
        # the class axis is dim 0; naming it explicitly avoids the deprecated
        # implicit-dimension behaviour of softmax.
        probabilities = torch.nn.functional.softmax(logits, dim=0)
        # convert the tensor into plain Python floats
        probabilities = probabilities.tolist()
        # key-value pairs of each label and its corresponding probability
        return {labels[str(i)]: probabilities[i] for i in range(len(labels))}


# define the gradio interface
# NOTE(review): gradio.inputs / gradio.outputs and several keyword arguments
# below belong to an older Gradio API -- confirm the pinned Gradio version
# before upgrading.
demo = gradio.Interface(
    fn=classify,
    inputs=gradio.inputs.Image(
        shape=(224, 224),
        image_mode='RGB',
        source='upload',
        tool='editor',
        type='pil',
        label=None,
        optional=False,
    ),
    outputs=gradio.outputs.Label(num_top_classes=5, type='auto'),
    theme='grass',
    examples=[['bedroom.jpg'], ['bathroom_AS.jpg'], ['samsung_room.jpg']],
    live=True,
    layout='horizontal',
    title='Indoor Scene Recognition',
    description=DESCRIPTION,
    interpretation='default',
    article=ARTICLE,
    allow_flagging='never',
)
demo.launch()