Spaces:
Sleeping
Sleeping
import os | |
import pandas as pd | |
import numpy as np | |
import torch | |
from transformers import DPTFeatureExtractor, DPTForSemanticSegmentation | |
from PIL import Image | |
from torch import nn | |
import requests | |
import streamlit as st | |
# Streamlit app: run semantic segmentation with the "Intel/dpt-large-ade" DPT
# checkpoint on an uploaded image (or one of the bundled sample images), then
# show the colour-blended segmentation mask and the names of the classes found.
img_path = None  # not read anywhere in this script

st.title('Semantic Segmentation using DPT')
st.write('The DPT model was proposed in Vision Transformers for Dense Prediction by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. DPT is a model that leverages the Vision Transformer (ViT) as backbone for dense prediction tasks like semantic segmentation and depth estimation.')

file_upload = st.file_uploader('Raw Input Image')
image_path = st.selectbox(
    'Choose any one image for inference',
    ('Select image', 'image1.jpg', 'image2.jpg', 'image3.jpg'))

# An uploaded file takes precedence over the sample picked in the select box.
if file_upload is None:
    raw_image = image_path
else:
    raw_image = file_upload

# 'Select image' is the placeholder entry: nothing to do until a real image
# has been chosen or uploaded.
if raw_image != 'Select image':
    # Class names and colours are read from the CSV shipped next to the app.
    # NOTE(review): the checkpoint is "Intel/dpt-large-ade", while names and
    # palette come from class_dict_seg.csv -- confirm that the CSV row order
    # matches the label ids predicted by the model.
    df = pd.read_csv('class_dict_seg.csv')
    classes = df['name']
    palette = df[[' r', ' g', ' b']].values
    id2label = classes.to_dict()
    label2id = {v: k for k, v in id2label.items()}

    # Force three channels: grayscale, palette and RGBA inputs would otherwise
    # break the (height, width) extraction and the blend with the colour mask.
    image = np.asarray(Image.open(raw_image).convert("RGB"))
    st.success("Load Image: Success")

    with st.spinner('Loading Model...'):
        # Loaded but not used below; preprocessing is done with
        # `feature_extractor_inference`.
        feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large-ade")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")
        model = model.to(device)
        model.eval()
    st.success("Load model: Success")

    with st.spinner('Preparing image...'):
        # Prepare the image for the model.
        # NOTE(review): confirm DPTFeatureExtractor honours these two kwargs.
        feature_extractor_inference = DPTFeatureExtractor(do_random_crop=False, do_pad=False)
        pixel_values = feature_extractor_inference(image, return_tensors="pt").pixel_values.to(device)

    with st.spinner('Running inference...'):
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            outputs = model(pixel_values=pixel_values)

    with st.spinner('Postprocessing...'):
        logits = outputs.logits.cpu()
        # First, rescale the logits to the original image size.
        upsampled_logits = nn.functional.interpolate(
            logits,
            size=image.shape[:2],  # (height, width)
            mode='bilinear',
            align_corners=False)
        # Second, take the arg-max over the class dimension.
        seg = upsampled_logits.argmax(dim=1)[0].numpy()

        color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)  # height, width, 3
        all_labels = []
        for label, color in enumerate(palette):
            # Compute the mask once; it drives both colouring and label listing.
            mask = seg == label
            if mask.any():
                color_seg[mask, :] = color
                all_labels.append(id2label[label])

        # Reverse the channel order of the colour mask (kept from the original).
        # NOTE(review): the palette columns are named r/g/b and the image is
        # RGB, so this flip swaps red and blue in the overlay -- confirm intent.
        color_seg = color_seg[..., ::-1]

        # Blend image and mask 50/50.
        img = np.array(image) * 0.5 + color_seg * 0.5
        img = img.astype(np.uint8)

    st.image(img, caption="Segmented Image")
    st.header("Predicted Labels")
    for idx, label in enumerate(all_labels):
        st.subheader(f'{idx+1}) {label}')
    st.success("Success")
#url = "http://images.cocodataset.org/val2017/000000039769.jpg" | |
#image = Image.open(requests.get(url, stream=True).raw) | |
#st.success("Image open: Success") | |
#feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large-ade") | |
#model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade") | |
#st.success("Load model: Success") | |
#inputs = feature_extractor(images=image, return_tensors="pt") | |
#st.success("Feature extraction: Success") | |
#outputs = model(**inputs) | |
#logits = outputs.logits | |
#st.text(str(logits)) | |
#st.success("Success") |