"""Streamlit app that draws a caption *inside* an image using a depth map.

A MiDaS model estimates a per-pixel depth map for the image; the caption is
then composited only on the pixels selected by a depth threshold, so that
parts of the scene can occlude the text.
"""
import io

import cv2
import numpy as np
import PIL

# ``import PIL`` alone does not load the submodules used below; import them
# explicitly instead of relying on another package having imported them.
import PIL.Image
import PIL.ImageDraw
import PIL.ImageFont
import streamlit as st
import torch

DEBUG = False
if DEBUG:
    device = torch.device("cpu")
    model_name = "MiDaS_small"
else:
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model_name = "DPT_Large"

# Entries are "Font: <family> - <file stem>"; the UI strips the "Font: "
# prefix to build the path "fonts/<family> - <file stem>.ttf".
FONTS = [
    "Font: Serif - EBGaramond",
    "Font: Serif - Cinzel",
    "Font: Sans - Roboto",
    "Font: Sans - Lato",
    "Font: Display - Lobster",
    "Font: Display - LilitaOne",
    "Font: Handwriting - GreatVibes",
    "Font: Handwriting - Pacifico",
    "Font: Mono - Inconsolata",
    "Font: Mono - Cutive",
]

# Shared arguments for every cached helper except the model loader.
CACHE_KWARGS = {
    "show_spinner": False,
    "hash_funcs": {torch.nn.parameter.Parameter: lambda _: None},
    "allow_output_mutation": True,
    "ttl": 900,
    "max_entries": 20,
}

# Uploaded images whose height or width exceeds this are downscaled.
MAX_SIDE = 800


def hex_to_rgb(hex):
    """Convert a 6-digit hex colour string (without ``#``) to an RGB tuple.

    Args:
        hex: String such as ``"FFA500"``; characters 0-5 are read in pairs.

    Returns:
        Tuple ``(r, g, b)`` of ints parsed base-16 from each pair.

    Raises:
        ValueError: If a pair is not valid hexadecimal.
    """
    return tuple(int(hex[i : i + 2], 16) for i in (0, 2, 4))


@st.cache(
    show_spinner=True,
    hash_funcs={torch.nn.parameter.Parameter: lambda _: None},
    allow_output_mutation=True,
)
def load(model_type):
    """Load a MiDaS model and its matching input transform from torch hub.

    Args:
        model_type: Hub entry point name, e.g. ``"DPT_Large"``,
            ``"DPT_Hybrid"`` or ``"MiDaS_small"``.

    Returns:
        Tuple ``(midas, transform)``: the model moved to ``device`` and set
        to eval mode, and the transform to apply to an image before
        inference.
    """
    midas = torch.hub.load("intel-isl/MiDaS", model_type)
    midas.to(device)
    midas.eval()

    midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
    if model_type in ("DPT_Large", "DPT_Hybrid"):
        transform = midas_transforms.dpt_transform
    else:
        transform = midas_transforms.small_transform
    return midas, transform


midas, transform = load(model_name)


@st.cache(**CACHE_KWARGS)
def compute_depth(img):
    """Run the MiDaS model on ``img`` and return its prediction.

    Args:
        img: Image array; ``img.shape[:2]`` is used as the output size.

    Returns:
        NumPy array holding the model prediction, bicubically resized to
        ``img.shape[:2]``.
    """
    with torch.no_grad():
        prediction = midas(transform(img).to(device))
        prediction = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=img.shape[:2],
            mode="bicubic",
            align_corners=False,
        ).squeeze()
    return prediction.cpu().numpy()


@st.cache(**CACHE_KWARGS)
def get_mask1(
    shape,
    caption,
    font=None,
    font_size=0.08,
    color=(0, 0, 0),
    alpha=0.8,
    x=0.5,
    y=0.5,
):
    """Render the caption on a transparent canvas and derive its alpha mask.

    Args:
        shape: Shape of the target image; ``shape[0]`` is the height and
            ``shape[1]`` the width.
        caption: Text to draw.
        font: Path to a TrueType font file. When empty or ``None`` Pillow's
            built-in default font is used (``font_size`` is then ignored).
        font_size: Font size as a fraction of the image width.
        color: RGB tuple for the text fill.
        alpha: Text opacity; clamped to ``[0, 1]``.
        x: Horizontal text anchor as a fraction of the width (0 = left).
        y: Vertical text anchor as a fraction of the height, measured from
            the bottom (0 = bottom, 1 = top).

    Returns:
        Tuple ``(text, mask1)``: ``text`` is the RGB rendering of the caption
        (H x W x 3) and ``mask1`` is its alpha channel scaled to ``[0, 1]``
        and replicated over three channels (H x W x 3, float).
    """
    height, width = shape[0], shape[1]
    canvas = PIL.Image.new("RGBA", (width, height), (0, 0, 0, 0))
    draw = PIL.ImageDraw.Draw(canvas)

    if font:
        font_obj = PIL.ImageFont.truetype(font, int(font_size * width))
    else:
        # No font file given: fall back rather than crash in truetype().
        font_obj = PIL.ImageFont.load_default()

    opacity = int(max(min(1, alpha), 0) * 255)
    draw.text(
        (x * width, (1 - y) * height),
        caption,
        fill=(*color, opacity),
        font=font_obj,
    )

    rgba = np.array(canvas)
    # Alpha channel -> [0, 1], then broadcast to 3 channels via the dot product.
    mask1 = np.dot(np.expand_dims(rgba[:, :, -1] / 255, -1), np.ones((1, 3)))
    text = rgba[:, :, :-1]
    return text, mask1


@st.cache(**CACHE_KWARGS)
def get_mask2(depth_map, depth):
    """Build a 3-channel mask of the pixels on which the text may be drawn.

    The threshold is interpolated between the map's extremes:
    ``depth * min + (1 - depth) * max``. A pixel is selected when its value
    is strictly below that threshold, so ``depth=0`` selects everything
    below the maximum and ``depth=1`` selects nothing.

    Args:
        depth_map: 2-D array of model predictions.
        depth: Interpolation factor, expected in ``[0, 1]``.

    Returns:
        Float array of shape H x W x 3 containing 0.0 / 1.0.
    """
    threshold = depth * np.min(depth_map) + (1 - depth) * np.max(depth_map)
    selected = np.expand_dims(depth_map[:, :] < threshold, -1)
    return np.dot(selected, np.ones((1, 3)))


@st.cache(**CACHE_KWARGS)
def add_caption(
    img,
    caption,
    depth_map=None,
    x=0.5,
    y=0.5,
    depth=0.5,
    font_size=50,
    color=(255, 255, 255),
    font="",
    alpha=1,
):
    """Composite ``caption`` into ``img`` at the requested depth.

    Args:
        img: RGB image array (H x W x 3).
        caption: Text to draw.
        depth_map: Precomputed depth map for ``img``; computed with
            ``compute_depth`` when ``None``.
        x: Horizontal text anchor as a fraction of the width.
        y: Vertical text anchor as a fraction of the height, from the bottom.
        depth: Depth threshold factor passed to ``get_mask2``.
        font_size: Font size as a fraction of the image width.
            NOTE(review): the default of 50 is 50x the image width; callers
            in this file always pass an explicit fraction.
        color: RGB tuple for the text.
        font: Path to a TrueType font file.
        alpha: Text opacity in ``[0, 1]``.

    Returns:
        ``uint8`` array with the caption blended in.
    """
    if depth_map is None:
        depth_map = compute_depth(img)

    # x and y must be forwarded explicitly: they determine where the text is
    # drawn and therefore have to be part of get_mask1's arguments.
    text, mask1 = get_mask1(
        img.shape,
        caption,
        font=font,
        font_size=font_size,
        color=color,
        alpha=alpha,
        x=x,
        y=y,
    )
    mask2 = get_mask2(depth_map, depth)
    mask = mask1 * mask2
    return ((1 - mask) * img + mask * text).astype(np.uint8)


st.markdown(
    """
    """,
    unsafe_allow_html=True,
)

st.sidebar.markdown(
    """
# Depth-aware text addition

Add text ***inside*** an image!

Upload an image, enter some text and adjust the ***depth*** where you want the text to be displayed. You can also define its location and appearance (font, color, transparency and size).

Built with [PyTorch](https://pytorch.org/), Intel's [MiDaS model](https://pytorch.org/hub/intelisl_midas_v2/), [Streamlit](https://streamlit.io/), [pillow](https://python-pillow.org/) and inspired by the official [video](https://youtu.be/eTa1jHk1Lxc) of *Jenny of Oldstones* by Florence + the Machine
"""
)

uploaded_file = st.file_uploader("", type=["jpg", "jpeg"])


@st.cache(**CACHE_KWARGS)
def load_img(uploaded_file):
    """Load the image to caption together with its depth map.

    Args:
        uploaded_file: File object returned by ``st.file_uploader`` or
            ``None``, in which case the bundled ``pulp.jpg`` is used.

    Returns:
        Tuple ``(img, depth_map, default)``: the RGB image array, its depth
        map, and ``True`` when the bundled default image was loaded.
    """
    if uploaded_file is None:
        # Force RGB so the array is always H x W x 3, matching the masks.
        img = np.array(PIL.Image.open("pulp.jpg").convert("RGB"))
        default = True
    else:
        # Greyscale / CMYK JPEGs would otherwise break the compositing.
        img = np.array(PIL.Image.open(uploaded_file).convert("RGB"))
        height, width = img.shape[0], img.shape[1]
        if height > MAX_SIDE or width > MAX_SIDE:
            # cv2 expects dsize as (width, height); keep the aspect ratio.
            if height < width:
                new_size = (MAX_SIDE, int(MAX_SIDE * height / width))
            else:
                new_size = (int(MAX_SIDE * width / height), MAX_SIDE)
            img = cv2.resize(img, dsize=new_size, interpolation=cv2.INTER_CUBIC)
        default = False

    depth_map = compute_depth(img)
    return img, depth_map, default


img, depth_map, default = load_img(uploaded_file)

# Initial widget values: tuned for the bundled image, generic otherwise.
if default:
    x0, y0, alpha0, font_size0, depth0, font0 = 0.02, 0.68, 0.99, 0.07, 0.23, 4
    text0 = "Pulp Fiction"
else:
    x0, y0, alpha0, font_size0, depth0, font0 = 0.1, 0.9, 0.8, 0.08, 0.5, 0
    text0 = "Enter your text here"

colA, colB, colC = st.columns((13, 1, 1))
with colA:
    text = st.text_input("", text0)
with colB:
    st.markdown("Color:")
with colC:
    color = st.color_picker("", value="#FFFFFF")

col1, _, col2 = st.columns((4, 1, 4))
with col1:
    depth = st.select_slider(
        "",
        options=[i / 100 for i in range(101)],
        value=depth0,
        format_func=lambda x: "Foreground"
        if x == 0.0
        else "Background"
        if x == 1.0
        else "",
    )
    x = st.select_slider(
        "",
        options=[i / 100 for i in range(101)],
        value=x0,
        format_func=lambda x: "Left" if x == 0.0 else "Right" if x == 1.0 else "",
    )
    y = st.select_slider(
        "",
        options=[i / 100 for i in range(101)],
        value=y0,
        format_func=lambda x: "Bottom" if x == 0.0 else "Top" if x == 1.0 else "",
    )
with col2:
    font_size = st.select_slider(
        "",
        # i / 100 yields exactly the same floats as the literals 0.04, 0.07,
        # 0.08 and 0.2 used for the defaults and labels; 0.04 + i / 100 is
        # not guaranteed to, because of floating-point rounding.
        options=[i / 100 for i in range(4, 21)],
        value=font_size0,
        format_func=lambda x: "Small font"
        if x == 0.04
        else "Large font"
        if x == 0.2
        else "",
    )
    alpha = st.select_slider(
        "",
        options=[i / 100 for i in range(101)],
        value=alpha0,
        format_func=lambda x: "Transparent"
        if x == 0.0
        else "Opaque"
        if x == 1.0
        else "",
    )
    font = st.selectbox("", FONTS, index=font0)
    # Drop the leading "Font: " (6 characters) to get the file stem.
    font = f"fonts/{font[6:]}.ttf"

captioned = add_caption(
    img,
    text,
    depth_map=depth_map,
    x=x,
    y=y,
    depth=depth,
    font=font,
    font_size=font_size,
    alpha=alpha,
    color=hex_to_rgb(color[1:]),
)

st.image(captioned)

# Encode in memory: writing a fixed "result.jpg" to disk would be shared by
# every concurrent session of the app.
buffer = io.BytesIO()
PIL.Image.fromarray(captioned).save(buffer, format="JPEG")
st.download_button(
    label="Download image",
    data=buffer.getvalue(),
    file_name="result.jpg",
    mime="image/jpeg",
)