import os
from io import BytesIO
import requests
from datetime import datetime

# Interface utilities
import gradio as gr

# Data utilities
import numpy as np
import pandas as pd

# Image utilities
from PIL import Image
import cv2

# CLIP model
import torch
from transformers import CLIPTokenizer, CLIPModel

# Style transfer model
import paddlehub as hub

os.system("hub install stylepro_artistic==1.0.1")
stylepro_artistic = hub.Module(name="stylepro_artistic")

# CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)

# Load precomputed Unsplash data: photo metadata, CLIP image features, and photo ids
photos = pd.read_csv("unsplash-dataset/photos.tsv000", sep="\t", header=0)
photo_features = np.load("unsplash-dataset/features.npy")
photo_ids = pd.read_csv("unsplash-dataset/photo_ids.csv")
photo_ids = list(photo_ids["photo_id"])


def image_from_text(text_input):
    start = datetime.now()

    # Encode the text query with CLIP
    with torch.no_grad():
        inputs = tokenizer([text_input], padding=True, return_tensors="pt").to(device)
        text_features = model.get_text_features(**inputs).cpu().numpy()

    # Rank photos by similarity to the text features
    similarities = list((text_features @ photo_features.T).squeeze(0))

    # Return the best-matching image
    idx = sorted(zip(similarities, range(photo_features.shape[0])), key=lambda x: x[0], reverse=True)[0][1]
    photo_id = photo_ids[idx]
    photo_data = photos[photos["photo_id"] == photo_id].iloc[0]

    print(f"Time spent at CLIP: {datetime.now() - start}")
    start = datetime.now()

    # Download image
    response = requests.get(photo_data["photo_image_url"] + "?w=640")
    pil_image = Image.open(BytesIO(response.content)).convert("RGB")
    open_cv_image = np.array(pil_image)
    # Convert RGB to BGR for OpenCV
    open_cv_image = open_cv_image[:, :, ::-1].copy()
    print(f"Time spent at Image request: {datetime.now() - start}")
    return open_cv_image


def inference(content, style):
    content_image = image_from_text(content)
    start = datetime.now()
    result = stylepro_artistic.style_transfer(
        images=[{
            "content": content_image,
            "styles": [cv2.imread(style.name)],
        }])
    print(f"Time spent at Style Transfer: {datetime.now() - start}")
    # Convert the BGR result back to an RGB PIL image
    return Image.fromarray(np.uint8(result[0]["data"])[:, :, ::-1]).convert("RGB")


if __name__ == "__main__":
    title = "Neural Style Transfer"
    description = "Gradio demo for Neural Style Transfer. To use it, enter a text description of the image content and upload a style image. Read more at the links below."
    article = (
        "Parameter-Free Style Projection for Arbitrary Style Transfer | Github Repo\n\n"
        "Clip paper | Hugging Face Clip Implementation"
    )
    examples = [
        ["a cute kangaroo", "styles/starry.jpeg"],
        ["man holding beer", "styles/mona1.jpeg"],
    ]
    interface = gr.Interface(
        inference,
        inputs=[
            gr.inputs.Textbox(
                lines=1,
                placeholder="Describe the content of the image",
                default="a cute kangaroo",
                label="Describe the image to which the style will be applied",
            ),
            gr.inputs.Image(type="file", label="Style to be applied"),
        ],
        outputs=gr.outputs.Image(type="pil"),
        enable_queue=True,
        title=title,
        description=description,
        article=article,
        examples=examples,
    )
    interface.launch()
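

# The app expects precomputed CLIP features for the Unsplash photos in
# "unsplash-dataset/features.npy" with matching ids in "unsplash-dataset/photo_ids.csv".
# Below is a minimal sketch of how such features could be produced with the same CLIP
# checkpoint; the helper name, batch size, and local image paths are illustrative
# assumptions and the function is never called by the app itself.
def precompute_photo_features(image_paths, batch_size=16):
    from transformers import CLIPProcessor

    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    features = []
    for i in range(0, len(image_paths), batch_size):
        # Load a batch of images and encode them with CLIP's image encoder
        batch = [Image.open(p).convert("RGB") for p in image_paths[i:i + batch_size]]
        inputs = processor(images=batch, return_tensors="pt").to(device)
        with torch.no_grad():
            batch_features = model.get_image_features(**inputs)
        # L2-normalizing here would make the dot product in image_from_text a cosine similarity
        batch_features = batch_features / batch_features.norm(dim=-1, keepdim=True)
        features.append(batch_features.cpu().numpy())
    return np.concatenate(features, axis=0)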