# -*- coding: utf-8 -*-
"""## hugging face funcs"""
import io
import matplotlib.pyplot as plt
import requests
import inflect
from PIL import Image


def load_image_from_url(url):
    return Image.open(requests.get(url, stream=True).raw)

def render_results_in_image(in_pil_img, in_results):
    plt.figure(figsize=(16, 10))
    plt.imshow(in_pil_img)
    ax = plt.gca()

    for prediction in in_results:
        x, y = prediction['box']['xmin'], prediction['box']['ymin']
        w = prediction['box']['xmax'] - prediction['box']['xmin']
        h = prediction['box']['ymax'] - prediction['box']['ymin']

        ax.add_patch(plt.Rectangle((x, y),
                                   w,
                                   h,
                                   fill=False,
                                   color="green",
                                   linewidth=2))
        ax.text(
            x,
            y,
            f"{prediction['label']}: {round(prediction['score']*100, 1)}%",
            color='red'
        )

    plt.axis("off")

    # Save the modified image to a BytesIO object
    img_buf = io.BytesIO()
    plt.savefig(img_buf, format='png',
                bbox_inches='tight',
                pad_inches=0)
    img_buf.seek(0)
    modified_image = Image.open(img_buf)

    # Close the plot to prevent it from being displayed
    plt.close()

    return modified_image

def summarize_predictions_natural_language(predictions):
    summary = {}
    p = inflect.engine()

    for prediction in predictions:
        label = prediction['label']
        if label in summary:
            summary[label] += 1
        else:
            summary[label] = 1

    result_string = "In this image, there are "
    for i, (label, count) in enumerate(summary.items()):
        count_string = p.number_to_words(count)
        result_string += f"{count_string} {label}"
        if count > 1:
            result_string += "s"

        result_string += " "

        if i == len(summary) - 2:
            result_string += "and "

    # Strip the trailing space and close the sentence
    result_string = result_string.rstrip(', ') + "."

    return result_string
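
# A minimal sketch (hypothetical predictions, not produced by any model here) of the
# input format summarize_predictions_natural_language expects: a list of dicts with a
# 'label' key, as returned by the object-detection pipeline.
_example_predictions = [
    {"label": "cat", "score": 0.98, "box": {"xmin": 0, "ymin": 0, "xmax": 10, "ymax": 10}},
    {"label": "cat", "score": 0.91, "box": {"xmin": 5, "ymin": 5, "xmax": 20, "ymax": 20}},
    {"label": "dog", "score": 0.88, "box": {"xmin": 30, "ymin": 30, "xmax": 60, "ymax": 60}},
]
# Expected output: "In this image, there are two cats and one dog."
# print(summarize_predictions_natural_language(_example_predictions))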

##### To ignore warnings #####
import warnings
import logging
from transformers import logging as hf_logging


def ignore_warnings():
    # Ignore specific Python warnings
    warnings.filterwarnings("ignore", message="Some weights of the model checkpoint")
    warnings.filterwarnings("ignore", message="Could not find image processor class")
    warnings.filterwarnings("ignore", message="The `max_size` parameter is deprecated")

    # Adjust logging for libraries using the logging module
    logging.basicConfig(level=logging.ERROR)
    hf_logging.set_verbosity_error()

########
import numpy as np
import torch
import matplotlib.pyplot as plt


def show_mask(mask, ax, random_color=False):
    if random_color:
        color = np.concatenate([np.random.random(3),
                                np.array([0.6])],
                               axis=0)
    else:
        color = np.array([30/255, 144/255, 255/255, 0.6])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    ax.imshow(mask_image)
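
# A tiny sketch (synthetic mask, assumed shapes) of the broadcasting used in show_mask:
# an (H, W, 1) binary mask times a (1, 1, 4) RGBA color gives an (H, W, 4) overlay
# that imshow can draw on top of the photo.
_demo_mask = np.zeros((2, 3), dtype=bool)
_demo_mask[0, 1] = True
_demo_overlay = _demo_mask.reshape(2, 3, 1) * np.array([30/255, 144/255, 255/255, 0.6]).reshape(1, 1, -1)
assert _demo_overlay.shape == (2, 3, 4)  # only the pixel at row 0, column 1 is tinted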

def show_box(box, ax):
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0),
                               w,
                               h,
                               edgecolor='green',
                               facecolor=(0, 0, 0, 0),
                               lw=2))


def show_boxes_on_image(raw_image, boxes):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    for box in boxes:
        show_box(box, plt.gca())
    plt.axis('on')
    plt.show()


def show_points_on_image(raw_image, input_points, input_labels=None):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    input_points = np.array(input_points)
    if input_labels is None:
        labels = np.ones_like(input_points[:, 0])
    else:
        labels = np.array(input_labels)
    show_points(input_points, labels, plt.gca())
    plt.axis('on')
    plt.show()

def show_points_and_boxes_on_image(raw_image,
                                   boxes,
                                   input_points,
                                   input_labels=None):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    input_points = np.array(input_points)
    if input_labels is None:
        labels = np.ones_like(input_points[:, 0])
    else:
        labels = np.array(input_labels)
    show_points(input_points, labels, plt.gca())
    for box in boxes:
        show_box(box, plt.gca())
    plt.axis('on')
    plt.show()

def show_points(coords, labels, ax, marker_size=375):
    pos_points = coords[labels == 1]
    neg_points = coords[labels == 0]
    ax.scatter(pos_points[:, 0],
               pos_points[:, 1],
               color='green',
               marker='*',
               s=marker_size,
               edgecolor='white',
               linewidth=1.25)
    ax.scatter(neg_points[:, 0],
               neg_points[:, 1],
               color='red',
               marker='*',
               s=marker_size,
               edgecolor='white',
               linewidth=1.25)


def fig2img(fig):
    """Convert a Matplotlib figure to a PIL Image and return it."""
    buf = io.BytesIO()
    fig.savefig(buf)
    buf.seek(0)
    img = Image.open(buf)
    return img

def show_mask_on_image(raw_image, mask, return_image=False):
    if not isinstance(mask, torch.Tensor):
        mask = torch.Tensor(mask)

    if len(mask.shape) == 4:
        mask = mask.squeeze()

    fig, axes = plt.subplots(1, 1, figsize=(15, 15))
    mask = mask.cpu().detach()
    axes.imshow(np.array(raw_image))
    show_mask(mask, axes)
    axes.axis("off")
    plt.show()

    if return_image:
        fig = plt.gcf()
        return fig2img(fig)


def show_pipe_masks_on_image(raw_image, outputs, return_image=False):
    plt.imshow(np.array(raw_image))
    ax = plt.gca()
    for mask in outputs["masks"]:
        show_mask(mask, ax=ax, random_color=True)
    plt.axis("off")
    plt.show()

    if return_image:
        fig = plt.gcf()
        return fig2img(fig)
"""## imports""" | |
from transformers import pipeline | |
from transformers import SamModel, SamProcessor | |
from transformers import BlipForImageTextRetrieval | |
from transformers import AutoProcessor | |
from transformers.utils import logging | |
logging.set_verbosity_error() | |
#ignore_warnings() | |
import io | |
import matplotlib.pyplot as plt | |
import requests | |
import inflect | |
from PIL import Image | |
import os | |
import gradio as gr | |
import time | |
"""# Object detection | |
## hugging face model ("facebook/detr-resnet-50"). 167MB | |
""" | |
od_pipe = pipeline("object-detection", "facebook/detr-resnet-50") | |
chosen_model = pipeline("object-detection", "hustvl/yolos-small") | |
"""## gradio funcs""" | |
def get_object_detection_prediction(model_name, raw_image): | |
model = od_pipe | |
if "chosen-model" in model_name: | |
model = chosen_model | |
start = time.time() | |
pipeline_output = model(raw_image) | |
end = time.time() | |
elapsed_result = f'{model_name} object detection elapsed {end-start} seconds' | |
print(elapsed_result) | |
processed_image = render_results_in_image(raw_image, pipeline_output) | |
return [processed_image, elapsed_result] | |
"""# Image segmentation | |
## hugging face models: Zigeng/SlimSAM-uniform-77(segmentation) 39MB, Intel/dpt-hybrid-midas(depth) 490MB | |
""" | |
hugging_face_segmentation_pipe = pipeline("mask-generation", "Zigeng/SlimSAM-uniform-77") | |
hugging_face_segmentation_model = SamModel.from_pretrained("Zigeng/SlimSAM-uniform-77") | |
hugging_face_segmentation_processor = SamProcessor.from_pretrained("Zigeng/SlimSAM-uniform-77") | |
hugging_face_depth_estimator = pipeline(task="depth-estimation", model="Intel/dpt-hybrid-midas") | |
"""## chosen models: facebook/sam-vit-base(segmentation) 375MB, LiheYoung/depth-anything-small-hf(depth) 100MB""" | |
chosen_name = "facebook/sam-vit-base" | |
chosen_segmentation_pipe = pipeline("mask-generation", chosen_name) | |
chosen_segmentation_model = SamModel.from_pretrained(chosen_name) | |
chosen_segmentation_processor = SamProcessor.from_pretrained(chosen_name) | |
chosen_depth_estimator = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf") | |
"""## gradio funcs""" | |
input_points = [[[1600, 700]]] | |

def segment_image_pretrained(model_name, raw_image):
    processor = hugging_face_segmentation_processor
    model = hugging_face_segmentation_model
    if "chosen" in model_name:
        processor = chosen_segmentation_processor
        model = chosen_segmentation_model

    start = time.time()
    inputs = processor(raw_image,
                       input_points=input_points,
                       return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    predicted_masks = processor.image_processor.post_process_masks(
        outputs.pred_masks,
        inputs["original_sizes"],
        inputs["reshaped_input_sizes"])

    results = []
    predicted_mask = predicted_masks[0]
    end = time.time()

    elapsed_result = f'{model_name} pretrained image segmentation elapsed {end - start:.2f} seconds'
    print(elapsed_result)

    # SAM predicts three candidate masks per point prompt; render each one.
    for i in range(3):
        results.append(show_mask_on_image(raw_image, predicted_mask[:, i], return_image=True))

    results.append(elapsed_result)
    return results

def segment_image(model_name, raw_image):
    model = hugging_face_segmentation_pipe
    if "chosen" in model_name:
        print("chosen model used")
        model = chosen_segmentation_pipe

    start = time.time()
    output = model(raw_image, points_per_batch=32)
    end = time.time()

    elapsed_result = f'{model_name} raw image segmentation elapsed {end - start:.2f} seconds'
    print(elapsed_result)

    return [show_pipe_masks_on_image(raw_image, output, return_image=True), elapsed_result]

def depth_image(model_name, input_image):
    depth_estimator = hugging_face_depth_estimator
    print(model_name)
    if "chosen" in model_name:
        print("chosen model used")
        depth_estimator = chosen_depth_estimator

    start = time.time()
    out = depth_estimator(input_image)

    # Resize the predicted depth map back to the original image resolution.
    prediction = torch.nn.functional.interpolate(
        out["predicted_depth"].unsqueeze(0).unsqueeze(0),
        size=input_image.size[::-1],
        mode="bicubic",
        align_corners=False,
    )
    end = time.time()

    elapsed_result = f'{model_name} depth estimation elapsed {end - start:.2f} seconds'
    print(elapsed_result)

    output = prediction.squeeze().numpy()
    formatted = (output * 255 / np.max(output)).astype("uint8")
    depth = Image.fromarray(formatted)
    return [depth, elapsed_result]
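
# A tiny sketch (synthetic values) of the normalization step above: the raw depth map
# is scaled so its maximum lands at 255, then cast to uint8 for display as an image.
_demo_depth = np.array([[0.0, 1.0], [2.0, 4.0]])
_demo_scaled = (_demo_depth * 255 / np.max(_demo_depth)).astype("uint8")
assert _demo_scaled.tolist() == [[0, 63], [127, 255]]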
"""# Image retrieval | |
## hugging face model: Salesforce/blip-itm-base-coco 900MB | |
""" | |
hugging_face_retrieval_model = BlipForImageTextRetrieval.from_pretrained( | |
"Salesforce/blip-itm-base-coco") | |
hugging_face_retrieval_processor = AutoProcessor.from_pretrained( | |
"Salesforce/blip-itm-base-coco") | |
"""## chosen model: Salesforce/blip-itm-base-flickr 900MB""" | |
chosen_retrieval_model = BlipForImageTextRetrieval.from_pretrained( | |
"Salesforce/blip-itm-base-flickr") | |
chosen_retrieval_processor = AutoProcessor.from_pretrained( | |
"Salesforce/blip-itm-base-flickr") | |
"""## gradion func""" | |

def retrieve_image(model_name, raw_image, predict_text):
    processor = hugging_face_retrieval_processor
    model = hugging_face_retrieval_model
    if "chosen" in model_name:
        processor = chosen_retrieval_processor
        model = chosen_retrieval_model

    start = time.time()
    inputs = processor(images=raw_image,
                       text=predict_text,
                       return_tensors="pt")
    itm_scores = model(**inputs)[0]
    end = time.time()

    elapsed_result = f"{model_name} image retrieval elapsed {end - start:.2f} seconds"
    print(elapsed_result)

    # Softmax over the two ITM logits; index 1 is the "matched" probability.
    itm_score = torch.nn.functional.softmax(itm_scores, dim=1)
    return [f"""\
The image and text are matched \
with a probability of {itm_score[0][1]:.4f}""",
            elapsed_result]
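
# A tiny sketch (synthetic logits, not model output) of the softmax step above: the
# BLIP ITM head yields two logits per image-text pair, and index 1 is read as the
# probability that the text matches the image.
_demo_logits = torch.tensor([[-1.0, 2.0]])
_demo_probs = torch.nn.functional.softmax(_demo_logits, dim=1)
# _demo_probs[0][1] is roughly 0.95, so this pair would be reported as a likely match.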
"""# gradio""" | |
with gr.Blocks() as object_detection_tab: | |
gr.Markdown("# Detect objects on image") | |
gr.Markdown("Upload an image, choose model, press button.") | |
with gr.Row(): | |
with gr.Column(): | |
# Input components | |
input_image = gr.Image(label="Upload Image", type="pil") | |
model_selector = gr.Dropdown(["hugging-face(facebook/detr-resnet-50)", "chosen-model(hustvl/yolos-small)"], | |
label = "Select Model") | |
with gr.Column(): | |
# Output image | |
elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1) | |
output_image = gr.Image(label="Output Image", type="pil") | |
# Process button | |
process_btn = gr.Button("Detect objects") | |
# Connect the input components to the processing function | |
process_btn.click( | |
fn=get_object_detection_prediction, | |
inputs=[ | |
model_selector, | |
input_image | |
], | |
outputs=[output_image, elapsed_result] | |
) | |

with gr.Blocks() as image_segmentation_detection_tab:
    gr.Markdown("# Image segmentation")
    gr.Markdown("Upload an image, choose a model, and press the button.")

    with gr.Row():
        with gr.Column():
            # Input components
            input_image = gr.Image(label="Upload Image", type="pil")
            model_selector = gr.Dropdown(["hugging-face(Zigeng/SlimSAM-uniform-77)", "chosen-model(facebook/sam-vit-base)"],
                                         label="Select Model")
        with gr.Column():
            elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)
            # Output image
            output_image = gr.Image(label="Segmented image", type="pil")

    with gr.Row():
        with gr.Column():
            segment_btn = gr.Button("Segment image (not pretrained)")

    with gr.Row():
        elapsed_result_pretrained_segment = gr.Textbox(label="Seconds elapsed", lines=1)
        with gr.Column():
            segment_pretrained_output_image_1 = gr.Image(label="Segmented image by pretrained model", type="pil")
        with gr.Column():
            segment_pretrained_output_image_2 = gr.Image(label="Segmented image by pretrained model", type="pil")
        with gr.Column():
            segment_pretrained_output_image_3 = gr.Image(label="Segmented image by pretrained model", type="pil")

    with gr.Row():
        with gr.Column():
            segment_pretrained_model_selector = gr.Dropdown(["hugging-face(Zigeng/SlimSAM-uniform-77)", "chosen-model(facebook/sam-vit-base)"],
                                                            label="Select Model")
            segment_pretrained_btn = gr.Button("Segment image (pretrained)")

    with gr.Row():
        with gr.Column():
            depth_output_image = gr.Image(label="Depth image", type="pil")
            elapsed_result_depth = gr.Textbox(label="Seconds elapsed", lines=1)

    with gr.Row():
        with gr.Column():
            depth_model_selector = gr.Dropdown(["hugging-face(Intel/dpt-hybrid-midas)", "chosen-model(LiheYoung/depth-anything-small-hf)"],
                                               label="Select Model")
            depth_btn = gr.Button("Get image depth")

    segment_btn.click(
        fn=segment_image,
        inputs=[
            model_selector,
            input_image
        ],
        outputs=[output_image, elapsed_result]
    )

    segment_pretrained_btn.click(
        fn=segment_image_pretrained,
        inputs=[
            segment_pretrained_model_selector,
            input_image
        ],
        outputs=[segment_pretrained_output_image_1, segment_pretrained_output_image_2, segment_pretrained_output_image_3, elapsed_result_pretrained_segment]
    )

    depth_btn.click(
        fn=depth_image,
        inputs=[
            depth_model_selector,
            input_image,
        ],
        outputs=[depth_output_image, elapsed_result_depth]
    )

with gr.Blocks() as image_retrieval_tab:
    gr.Markdown("# Check whether a text description matches an image")
    gr.Markdown("Upload an image, choose a model, and press the button.")

    with gr.Row():
        with gr.Column():
            # Input components
            input_image = gr.Image(label="Upload Image", type="pil")
            text_prediction = gr.TextArea(label="Describe image")
            model_selector = gr.Dropdown(["hugging-face(Salesforce/blip-itm-base-coco)", "chosen-model(Salesforce/blip-itm-base-flickr)"],
                                         label="Select Model")
        with gr.Column():
            # Output components
            output_result = gr.Textbox(label="Probability result", lines=3)
            elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)

    # Process button
    process_btn = gr.Button("Check description")

    # Connect the input components to the processing function
    process_btn.click(
        fn=retrieve_image,
        inputs=[
            model_selector,
            input_image,
            text_prediction
        ],
        outputs=[output_result, elapsed_result]
    )

with gr.Blocks() as app:
    gr.TabbedInterface(
        [object_detection_tab,
         image_segmentation_detection_tab,
         image_retrieval_tab],
        ["Object detection",
         "Image segmentation",
         "Retrieve image"],
    )

app.launch(share=True, debug=True)
app.close()