Spaces:

vivien
/

depth-aware-caption

Sleeping

File size: 6,874 Bytes

import numpy as np
from PIL import ImageDraw, Image, ImageFont
from transformers import DPTFeatureExtractor, DPTForDepthEstimation
import torch
import streamlit as st

# (family, typeface) pairs behind the font selector. Every label has the form
# "Font: <family> - <typeface>"; main() drops the first six characters
# ("Font: ") and uses the remainder as the .ttf file name under fonts/.
_FONT_CHOICES = (
    ("Serif", "EBGaramond"),
    ("Serif", "Cinzel"),
    ("Sans", "Roboto"),
    ("Sans", "Lato"),
    ("Display", "Lobster"),
    ("Display", "LilitaOne"),
    ("Handwriting", "GreatVibes"),
    ("Handwriting", "Pacifico"),
    ("Mono", "Inconsolata"),
    ("Mono", "Cutive"),
)

FONTS = [f"Font: {family} - {typeface}" for family, typeface in _FONT_CHOICES]


def hex_to_rgb(hex):
    """Convert a hexadecimal color string to an ``(r, g, b)`` tuple of ints.

    Args:
        hex: Color digits such as ``"FFA500"``. A single leading ``#`` is
            accepted, and the three-digit shorthand (``"FA0"``) is expanded
            to six digits. Any characters after the first six digits are
            ignored.

    Returns:
        Tuple of three integers, one per pair of hexadecimal digits.

    Raises:
        ValueError: If fewer than six digits are available (after expanding
            the shorthand) or a pair is not valid hexadecimal.
    """
    # NOTE: the parameter shadows the builtin ``hex``; the name is kept so
    # callers passing it by keyword keep working.
    digits = hex[1:] if hex.startswith("#") else hex
    if len(digits) == 3:
        # CSS-style shorthand: "f80" means "ff8800".
        digits = "".join(ch * 2 for ch in digits)
    if len(digits) < 6:
        # Without this check a 5-digit string would silently yield a wrong
        # last channel (its final pair would be a single digit).
        raise ValueError(f"expected 3 or 6 hexadecimal digits, got {hex!r}")
    return tuple(int(digits[i : i + 2], 16) for i in (0, 2, 4))


@st.cache(allow_output_mutation=True)
def load():
    """Load the pretrained DPT depth model and its feature extractor.

    Both objects come from the "Intel/dpt-large" checkpoint. The function is
    wrapped in ``st.cache`` with ``allow_output_mutation=True``.

    Returns:
        A ``(model, feature_extractor)`` tuple.
    """
    checkpoint = "Intel/dpt-large"
    depth_model = DPTForDepthEstimation.from_pretrained(checkpoint)
    extractor = DPTFeatureExtractor.from_pretrained(checkpoint)
    return depth_model, extractor


# Module-level handles read by compute_depth().
model, feature_extractor = load()


def compute_depth(image):
    """Run the depth model on ``image`` and return a map at image resolution.

    Args:
        image: Image whose ``size`` attribute is a ``(width, height)`` pair;
            the caller in this file passes a PIL image.

    Returns:
        2-D NumPy array with one predicted value per pixel, laid out as
        ``(height, width)``.
    """
    model_inputs = feature_extractor(images=image, return_tensors="pt")
    with torch.no_grad():
        raw_depth = model(**model_inputs).predicted_depth

    width, height = image.size
    # interpolate() works on (batch, channel, H, W) tensors, so a channel axis
    # is inserted before resizing the prediction to the image's own size.
    resized = torch.nn.functional.interpolate(
        raw_depth.unsqueeze(1),
        size=(height, width),
        mode="bicubic",
        align_corners=False,
    )
    # Drop the batch and channel axes again.
    return resized.cpu().numpy()[0, 0, :, :]


def get_mask1(
    shape, x, y, caption, font=None, font_size=0.08, color=(0, 0, 0), alpha=0.8
):
    """Render ``caption`` on a transparent canvas and split it into layers.

    Args:
        shape: Shape of the target image array; ``shape[0]`` is used as the
            height and ``shape[1]`` as the width.
        x: Horizontal text position as a fraction of the width.
        y: Vertical text position as a fraction of the height, measured from
            the bottom (the drawing coordinate is ``(1 - y) * height``).
        caption: Text to draw.
        font: Font file passed to ``ImageFont.truetype``.
        font_size: Font size as a fraction of the image width.
        color: RGB components of the text color.
        alpha: Text opacity; clamped to ``[0, 1]`` before being scaled to 255.

    Returns:
        A ``(rgb, mask1)`` tuple: the color channels of the rendered canvas,
        and the canvas alpha channel scaled to ``[0, 1]`` and repeated over
        three channels.
    """
    height, width = shape[0], shape[1]
    canvas = Image.new("RGBA", (width, height), (0, 0, 0, 0))
    typeface = ImageFont.truetype(font, int(font_size * width))
    opacity = int(max(min(1, alpha), 0) * 255)
    ImageDraw.Draw(canvas).text(
        (x * width, (1 - y) * height),
        caption,
        fill=(*color, opacity),
        font=typeface,
    )

    rgba = np.array(canvas)
    coverage = rgba[:, :, -1] / 255
    # One copy of the alpha plane per color channel.
    mask1 = np.repeat(coverage[:, :, np.newaxis], 3, axis=2)
    return rgba[:, :, :-1], mask1


def get_mask2(depth_map, depth):
    """Flag the pixels whose depth value lies below an interpolated cutoff.

    The cutoff slides linearly from the map's maximum (``depth == 0``) to its
    minimum (``depth == 1``).

    Args:
        depth_map: 2-D array of per-pixel depth values.
        depth: Interpolation weight between the map's extremes.

    Returns:
        Boolean array of shape ``(H, W, 1)``; ``True`` where the value is
        strictly below the cutoff.
    """
    lowest = np.min(depth_map)
    highest = np.max(depth_map)
    cutoff = depth * lowest + (1 - depth) * highest
    below = depth_map[:, :] < cutoff
    return below[..., np.newaxis]


def add_caption(
    img,
    caption,
    depth_map=None,
    x=0.5,
    y=0.5,
    depth=0.5,
    font_size=50,
    color=(255, 255, 255),
    font="",
    alpha=1,
):
    """Blend ``caption`` into ``img``, hiding it where the depth mask says so.

    Args:
        img: Image as an ``(H, W, 3)`` array.
        caption: Text to draw.
        depth_map: Per-pixel depth values for ``img``. When ``None`` no depth
            masking is applied and the text is drawn over the whole image.
        x: Horizontal text position, forwarded to ``get_mask1``.
        y: Vertical text position, forwarded to ``get_mask1``.
        depth: Depth setting forwarded to ``get_mask2``.
        font_size: Forwarded to ``get_mask1``, which multiplies it by the
            image width. NOTE: the default of 50 is therefore not a pixel
            size; callers should pass a fraction such as 0.08.
        color: RGB components of the text color.
        font: Font file forwarded to ``get_mask1``.
        alpha: Text opacity, forwarded to ``get_mask1``.

    Returns:
        ``uint8`` array with the same shape as ``img`` containing the
        captioned image.
    """
    text, mask1 = get_mask1(
        img.shape,
        x,
        y,
        caption,
        font=font,
        font_size=font_size,
        color=color,
        alpha=alpha,
    )
    if depth_map is None:
        # Without depth information nothing can occlude the text; previously
        # this default crashed inside get_mask2.
        mask = mask1
    else:
        # get_mask2 returns an (H, W, 1) boolean array, which broadcasts
        # across the three channels of mask1.
        mask = mask1 * get_mask2(depth_map, depth)

    return ((1 - mask) * img + mask * text).astype(np.uint8)


@st.cache(max_entries=30, show_spinner=False)
def load_img(uploaded_file):
    """Open the image to caption and compute its depth map.

    Args:
        uploaded_file: Object accepted by ``Image.open``, or ``None`` to use
            the local file ``pulp.jpg`` instead.

    Returns:
        A ``(pixels, depth_map, default)`` tuple: the RGB image as a NumPy
        array, the result of ``compute_depth`` for it, and ``True`` when
        ``pulp.jpg`` was used.
    """
    max_side = 800
    default = uploaded_file is None
    if default:
        img = Image.open("pulp.jpg")
    else:
        img = Image.open(uploaded_file)
        width, height = img.size
        if width > max_side or height > max_side:
            # Scale the longer side down to max_side and keep the aspect
            # ratio. The shorter side is clamped to 1 px so that an extreme
            # aspect ratio cannot produce a zero-sized image.
            if width < height:
                new_size = (max(1, int(max_side * width / height)), max_side)
            else:
                new_size = (max_side, max(1, int(max_side * height / width)))
            img = img.resize(new_size)
    # Grayscale or CMYK JPEGs would otherwise give arrays with one or four
    # channels, which cannot be blended with the 3-channel text layer.
    img = img.convert("RGB")
    return np.array(img), compute_depth(img), default


def main():
    """Build the Streamlit page and show the captioned image.

    The page injects a CSS rule that collapses widget labels, fills the
    sidebar with a description, loads the uploaded (or default) image with
    its depth map, collects text, color, position, depth, size, opacity and
    font from widgets, and displays the result of ``add_caption``.
    """
    hide_labels_css = """
    <style>
        label{
            height: 0px !important;
            min-height: 0px !important;
            margin-bottom: 0px !important;
        }
    </style>
        """
    st.markdown(hide_labels_css, unsafe_allow_html=True)

    about = """
    # Depth-aware text addition

    Add text ***inside*** an image!

    Upload an image, enter some text and adjust the ***depth*** where you want the text to be displayed. You can also define its location and appearance (font, color, transparency and size).

    Built with [PyTorch](https://pytorch.org/), Intel's [MiDaS model](https://pytorch.org/hub/intelisl_midas_v2/), [Streamlit](https://streamlit.io/), [pillow](https://python-pillow.org/) and inspired by the official [video](https://youtu.be/eTa1jHk1Lxc) of *Jenny of Oldstones* by Florence + the Machine 
    
    To go further:
    - [blog post](https://vivien000.github.io/blog/journal/adding-text-inside-pictures-and-videos.html)
    - [notebook](https://colab.research.google.com/github/vivien000/depth-aware_captioning/blob/master/Depth_aware_Video_Captioning.ipynb) for videos
    - [examples](https://youtu.be/RtkBplRuWhg?list=PLlPB25tBWqtVhj4Ink8hl9Evc2dlIX4Jh) of videos
    """
    st.sidebar.markdown(about)

    def end_labels(low_value, low_label, high_value, high_label):
        """Return a format_func that labels only the two ends of a slider."""

        def label_for(value):
            if value == low_value:
                return low_label
            if value == high_value:
                return high_label
            return ""

        return label_for

    uploaded_file = st.file_uploader("", type=["jpg", "jpeg"])
    with st.spinner("Analyzing the image - Please wait a few seconds"):
        img, depth_map, default = load_img(uploaded_file)

    # Initial widget values: one preset for the default picture, another for
    # user uploads.
    presets = (
        ("Pulp Fiction", 0.02, 0.68, 0.99, 0.07, 0.12, 4)
        if default
        else ("Enter your text here", 0.1, 0.9, 0.8, 0.08, 0.5, 0)
    )
    text0, x0, y0, alpha0, font_size0, depth0, font0 = presets

    # Steps 0.00, 0.01, ..., 1.00 shared by the fraction-valued sliders.
    percent_steps = [i / 100 for i in range(101)]

    text_col, color_label_col, color_col = st.columns((13, 1, 1))

    with text_col:
        text = st.text_input("", text0)

    with color_label_col:
        st.markdown("Color:")

    with color_col:
        color = st.color_picker("", value="#FFFFFF")

    left_col, _, right_col = st.columns((4, 1, 4))

    with left_col:
        depth = st.select_slider(
            "",
            options=percent_steps,
            value=depth0,
            format_func=end_labels(0.0, "Foreground", 1.0, "Background"),
        )
        x = st.select_slider(
            "",
            options=percent_steps,
            value=x0,
            format_func=end_labels(0.0, "Left", 1.0, "Right"),
        )
        y = st.select_slider(
            "",
            options=percent_steps,
            value=y0,
            format_func=end_labels(0.0, "Bottom", 1.0, "Top"),
        )

    with right_col:
        font_size = st.select_slider(
            "",
            options=[0.04 + i / 100 for i in range(0, 17)],
            value=font_size0,
            format_func=end_labels(0.04, "Small font", 0.2, "Large font"),
        )
        alpha = st.select_slider(
            "",
            options=percent_steps,
            value=alpha0,
            format_func=end_labels(0.0, "Transparent", 1.0, "Opaque"),
        )
        font_label = st.selectbox("", FONTS, index=font0)

    # Drop the leading "Font: " from the label to get the font file name.
    font_path = f"fonts/{font_label[6:]}.ttf"

    st.image(
        add_caption(
            img,
            text,
            x=x,
            y=y,
            depth=depth,
            depth_map=depth_map,
            font=font_path,
            font_size=font_size,
            alpha=alpha,
            # The color picker value starts with "#", which hex_to_rgb is
            # not given.
            color=hex_to_rgb(color[1:]),
        )
    )


# Build the page when this file is executed as the main module.
if __name__ == "__main__":
    main()