# NOTE: web-viewer residue (file-size banner, commit hashes, line-number gutter) was removed from the top of this file.
import os
import tensorflow as tf
os.environ['TFHUB_MODEL_LOAD_FORMAT'] = 'COMPRESSED'
import numpy as np
import PIL.Image
import gradio as gr
import tensorflow_hub as hub
import matplotlib.pyplot as plt

import gradio as gr
import requests
import io
import random
import os
from PIL import Image, ImageDraw, ImageFont

from datasets import load_dataset
import pandas as pd
from time import sleep
from tqdm import tqdm

import extcolors
from gradio_client import Client

import cv2
import numpy as np
import glob
import pathlib

# Hugging Face token read from the environment; generate_txt2img sends it as a Bearer token.
API_TOKEN = os.environ.get("HF_READ_TOKEN")

# Defaults for the event prompt, the role substituted for "X", and the style cover image.
DEFAULT_PROMPT = "X go to Istanbul"
DEFAULT_ROLE = "Superman"
DEFAULT_BOOK_COVER = "book_cover_dir/Blank.png"

# Style-transfer module, loaded once at import time and used as the default in perform_neural_transfer.
hub_module = hub.load('https://tfhub.dev/google/magenta/arbitrary-image-stylization-v1-256/2')

def tensor_to_image(tensor):
    """Scale *tensor* by 255, cast it to uint8 and wrap it in a PIL image.

    An array with more than three dimensions must have a leading axis of
    length 1; that axis is dropped before the image is built.
    """
    pixels = np.array(tensor * 255, dtype=np.uint8)
    if pixels.ndim > 3:
        assert pixels.shape[0] == 1
        pixels = pixels[0]
    return PIL.Image.fromarray(pixels)


def perform_neural_transfer(content_image_input, style_image_input, hub_module = hub_module):
    """Render the content image in the look of the style image.

    Args:
        content_image_input: NumPy image array indexed as (height, width, ...)
            with values on a 0-255 scale.
        style_image_input: NumPy image array used as the style reference,
            also on a 0-255 scale.
        hub_module: Callable style-transfer module; defaults to the module
            loaded at import time.

    Returns:
        A PIL image of the stylized result, resized back to the width and
        height of ``content_image_input``.
    """
    # Add a batch axis and rescale to 0-1 before resizing to the fixed
    # working resolutions used for the content and style inputs.
    content_image = content_image_input.astype(np.float32)[np.newaxis, ...] / 255.
    content_image = tf.image.resize(content_image, (400, 600))
    style_image = style_image_input.astype(np.float32)[np.newaxis, ...] / 255.
    style_image = tf.image.resize(style_image, (256, 256))

    outputs = hub_module(tf.constant(content_image), tf.constant(style_image))
    stylized_image = tensor_to_image(outputs[0])

    # Take the target size straight from the array shape. Converting the
    # 0-255 content array through tensor_to_image (which multiplies by 255)
    # only to read ``.size`` built a throwaway, overflowed image.
    height, width = content_image_input.shape[:2]
    return stylized_image.resize((width, height))

# Model names offered in the "Current Model" dropdown; generate_txt2img maps
# each of them to an inference endpoint.
list_models = [
    "Pixel-Art-XL",
    "SD-1.5",
    "OpenJourney-V4",
    "Anything-V4",
    "Disney-Pixar-Cartoon",
    "Dalle-3-XL",
]


def generate_txt2img(current_model, prompt, is_negative=False, image_style="None style", steps=50, cfg_scale=7,
                     seed=None, API_TOKEN = API_TOKEN):
    """Request one image from a hosted text-to-image model.

    Args:
        current_model: One of the names in ``list_models``.
        prompt: Text prompt; any non-string value is replaced by
            ``DEFAULT_PROMPT``.
        is_negative: Value sent under the ``"is_negative"`` payload key. For
            the "Cinematic" and "Digital Art" styles a style-specific suffix
            is appended to it; a non-string value (such as the default
            ``False``) is treated as an empty negative prompt there.
        image_style: "None style", "Cinematic", "Digital Art" or "Portrait".
        steps: Value sent under the ``"steps"`` payload key.
        cfg_scale: Value sent under the ``"cfg_scale"`` payload key.
        seed: Seed to send; a random one is drawn when ``None``.
        API_TOKEN: Bearer token placed in the Authorization header.

    Returns:
        The response body opened as a PIL image.

    Raises:
        ValueError: If ``current_model`` or ``image_style`` is not recognised.
        requests.HTTPError: If the endpoint answers with an error status.
    """
    model_repos = {
        "SD-1.5": "runwayml/stable-diffusion-v1-5",
        "OpenJourney-V4": "prompthero/openjourney",
        "Anything-V4": "xyn-ai/anything-v4.0",
        "Disney-Pixar-Cartoon": "stablediffusionapi/disney-pixar-cartoon",
        "Pixel-Art-XL": "nerijs/pixel-art-xl",
        "Dalle-3-XL": "openskyml/dalle-3-xl",
    }
    # style name -> (suffix appended to the prompt,
    #                suffix appended to the negative prompt, or None)
    style_presets = {
        "None style": (", 8k", None),
        "Cinematic": (
            ", realistic, detailed, textured, skin, hair, eyes, by Alex Huguet, Mike Hill, Ian Spriggs, JaeCheol Park, Marek Denko",
            ", abstract, cartoon, stylized",
        ),
        "Digital Art": (
            ", faded , vintage , nostalgic , by Jose Villa , Elizabeth Messina , Ryan Brenizer , Jonas Peterson , Jasmine Star",
            ", sharp , modern , bright",
        ),
        "Portrait": (
            ", soft light, sharp, exposure blend, medium shot, bokeh, (hdr:1.4), high contrast, (cinematic, teal and orange:0.85), (muted colors, dim colors, soothing tones:1.3), low saturation, (hyperdetailed:1.2), (noir:0.4), (natural skin texture, hyperrealism, soft light, sharp:1.2)",
            None,
        ),
    }

    # Fail with a clear message instead of an UnboundLocalError further down.
    if current_model not in model_repos:
        raise ValueError("Unknown model: {!r}".format(current_model))
    if image_style not in style_presets:
        raise ValueError("Unknown image style: {!r}".format(image_style))

    API_URL = "https://api-inference.huggingface.co/models/" + model_repos[current_model]
    headers = {"Authorization": f"Bearer {API_TOKEN}"}

    if not isinstance(prompt, str):
        prompt = DEFAULT_PROMPT

    prompt_suffix, negative_suffix = style_presets[image_style]
    if negative_suffix is not None:
        if isinstance(is_negative, str):
            is_negative = is_negative + negative_suffix
        else:
            # The default is the bool False, which cannot be concatenated
            # with a string; send only the style's own negative terms.
            is_negative = negative_suffix.lstrip(", ")

    payload = {
        "inputs": prompt + prompt_suffix,
        "is_negative": is_negative,
        "steps": steps,
        "cfg_scale": cfg_scale,
        "seed": seed if seed is not None else random.randint(-1, 2147483647),
    }

    response = requests.post(API_URL, headers=headers, json=payload)
    # Surface HTTP failures directly rather than handing an error body to PIL.
    response.raise_for_status()
    return Image.open(io.BytesIO(response.content))

from huggingface_hub import InferenceClient
import gradio as gr
import pandas as pd
import numpy as np
import os

# Few-shot conversation for the event-reasoning prompts, one (User, Assistant)
# turn per row. The first row sets the task; the following rows give worked
# answers for the four question types. The last row has an empty answer and is
# dropped when the chat history `hist` is built from this frame.
event_reasoning_df = pd.DataFrame(
                [['Use the following events as a background to answer questions related to the cause and effect of time.', 'Ok'],

                ['What are the necessary preconditions for the next event?:X had a big meal.', 'X placed an order'],
                ['What could happen after the next event?:X had a big meal.', 'X becomes fat'],
                ['What is the motivation for the next event?:X had a big meal.', 'X is hungry'],
                ['What are your feelings after the following event?:X had a big meal.', "X tastes good"],

                ['What are the necessary preconditions for the next event?:X met his favorite star.', 'X bought a ticket'],
                ['What could happen after the next event?:X met his favorite star.', 'X is motivated'],
                ['What is the motivation for the next event?:X met his favorite star.', 'X wants to have some entertainment'],
                ['What are your feelings after the following event?:X met his favorite star.', "X is in a happy mood"],

                ['What are the necessary preconditions for the next event?: X to cheat', 'X has evil intentions'],
                ['What could happen after the next event?:X to cheat', 'X is accused'],
                ['What is the motivation for the next event?:X to cheat', 'X wants to get something for nothing'],
                ['What are your feelings after the following event?:X to cheat', "X is starving and freezing in prison"],

                ['What could happen after the next event?:X go to Istanbul', ''],
                             ],
                             columns = ["User", "Assistant"]
                             )

# Inference client used as the default text-generation backend in generate().
Mistral_7B_client = InferenceClient(
    "mistralai/Mistral-7B-Instruct-v0.1"
)

# Question prefixes for the four reasoning dimensions. They are used both to
# build prompts ("<prefix>:<event>") and as dictionary keys for the answers,
# and they match the question wording in event_reasoning_df.
NEED_PREFIX = 'What are the necessary preconditions for the next event?'
EFFECT_PREFIX = 'What could happen after the next event?'
INTENT_PREFIX = 'What is the motivation for the next event?'
REACT_PREFIX = 'What are your feelings after the following event?'

def format_prompt(message, history):
    """Build an ``[INST]``-tagged prompt string from a chat history.

    Args:
        message: The new user message, placed last.
        history: Iterable of ``(user_prompt, bot_response)`` pairs that are
            rendered, in order, before ``message``.

    Returns:
        ``"<s>"`` followed by one ``[INST] ... [/INST] answer</s> `` segment
        per history pair and a final ``[INST] message [/INST]`` segment.
    """
    segments = ["<s>"]
    for user_prompt, bot_response in history:
        segments.append(f"[INST] {user_prompt} [/INST] {bot_response}</s> ")
    segments.append(f"[INST] {message} [/INST]")
    return "".join(segments)

def generate(
    prompt, history, client = Mistral_7B_client,
    temperature=0.7, max_new_tokens=256, top_p=0.95, repetition_penalty=1.1,
):
    """Stream a completion for *prompt*, yielding the text accumulated so far.

    Args:
        prompt: New user message.
        history: ``(user, assistant)`` pairs passed to ``format_prompt``.
        client: Object exposing ``text_generation``; defaults to the
            module-level Mistral client.
        temperature: Sampling temperature; values below 0.01 are raised
            to 0.01.
        max_new_tokens: Forwarded to ``text_generation``.
        top_p: Forwarded to ``text_generation`` as a float.
        repetition_penalty: Forwarded to ``text_generation``.

    Yields:
        The concatenation of all token texts received so far, once per
        streamed response item.
    """
    temperature = float(temperature)
    top_p = float(top_p)
    if temperature < 1e-2:
        temperature = 1e-2

    sampling_options = {
        "temperature": temperature,
        "max_new_tokens": max_new_tokens,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,
    }

    token_stream = client.text_generation(
        format_prompt(prompt, history),
        **sampling_options,
        stream=True,
        details=True,
        return_full_text=False,
    )

    text_so_far = ""
    for chunk in token_stream:
        text_so_far += chunk.token.text
        yield text_so_far
    return text_so_far

# Chat history built from every row of event_reasoning_df except the last
# (the last row has an empty assistant answer): one (User, Assistant) tuple
# per row. Used as the default history in produce_4_event.
hist = event_reasoning_df.iloc[:-1, :].apply(
    lambda x: (x["User"], x["Assistant"]), axis = 1
)

def produce_4_event(event_fact, hist = hist):
    """Ask the language model the four reasoning questions about an event.

    Each question is ``"<prefix>:<event_fact>"``. The final streamed text of
    every answer is kept, with any ``"</s>"`` marker stripped.

    Args:
        event_fact: Event description, e.g. ``"X go to Istanbul"``.
        hist: Chat history passed to ``generate``.

    Returns:
        Dict keyed by NEED_PREFIX, EFFECT_PREFIX, INTENT_PREFIX and
        REACT_PREFIX (in that order) holding the corresponding answers.
    """
    answers = {}
    for prefix in (NEED_PREFIX, EFFECT_PREFIX, INTENT_PREFIX, REACT_PREFIX):
        question = "{}:{}".format(prefix, event_fact)
        # generate() yields growing partial texts; the last one is complete.
        final_text = list(generate(question, history = hist, max_new_tokens = 2048))[-1]
        answers[prefix] = final_text.replace("</s>", "")
    return answers

def transform_4_event_as_sd_prompts(event_fact ,event_reasoning_dict, role_name = "superman"):
    """Turn an event and its four reasoning answers into image prompts.

    Every text has "X" replaced by ``role_name`` and is then rewritten by the
    language model as a Stable Diffusion prompt. The substitution is applied
    only when ``role_name`` is a non-blank string, and that single rule now
    covers the reasoning answers, the event fact and the captions alike.
    Previously only the answers were guarded, so a ``None`` role raised and
    an empty role silently deleted every "X".

    Args:
        event_fact: Event description containing the placeholder "X".
        event_reasoning_dict: Dict keyed by the four ``*_PREFIX`` constants,
            as returned by ``produce_4_event``.
        role_name: Name substituted for "X".

    Returns:
        ``(caption_list, req_list)``: two lists of five strings ordered as
        intent, need, event fact, reaction, effect. Captions are the
        role-substituted source texts; ``req_list`` holds the generated
        image prompts.
    """
    has_role = isinstance(role_name, str) and bool(role_name.strip())

    def with_role(text):
        # Substitute the role only when a usable one was supplied.
        return text.replace("X", role_name) if has_role else text

    def to_sd_prompt(text):
        # Keep the last (complete) streamed text and drop the end marker.
        instruction = "Transform this as a prompt in stable diffusion: {}".format(text)
        return list(generate(instruction, history = [], max_new_tokens = 2048))[-1].replace("</s>", "")

    req = {k: to_sd_prompt(with_role(v)) for k, v in event_reasoning_dict.items()}
    req["EVENT_FACT"] = to_sd_prompt(with_role(event_fact))

    req_list = [
        req[INTENT_PREFIX], req[NEED_PREFIX],
        req["EVENT_FACT"],
        req[REACT_PREFIX], req[EFFECT_PREFIX],
    ]
    caption_sources = [
        event_reasoning_dict[INTENT_PREFIX], event_reasoning_dict[NEED_PREFIX],
        event_fact,
        event_reasoning_dict[REACT_PREFIX], event_reasoning_dict[EFFECT_PREFIX],
    ]
    caption_list = [with_role(text) for text in caption_sources]
    return caption_list ,req_list

def batch_as_list(input_, batch_size = 3):
    """Group the items of *input_* into consecutive lists.

    A new group is started whenever the current one already holds at least
    ``batch_size`` items, so the last group may be shorter.

    Args:
        input_: Any iterable.
        batch_size: Maximum number of items per group.

    Returns:
        A list of lists preserving the original item order.
    """
    batches = []
    current = None
    for item in input_:
        if current is None or len(current) >= batch_size:
            current = [item]
            batches.append(current)
        else:
            current.append(item)
    return batches

def add_margin(pil_img, top, right, bottom, left, color):
    """Return a new image with *pil_img* pasted onto a larger solid canvas.

    The canvas has the same mode as ``pil_img``, is filled with ``color``
    and is larger by the given number of pixels on each side.
    """
    src_width, src_height = pil_img.size
    canvas_size = (src_width + right + left, src_height + top + bottom)
    canvas = Image.new(pil_img.mode, canvas_size, color)
    canvas.paste(pil_img, (left, top))
    return canvas

def add_caption_on_image(input_image, caption, marg_ratio = 0.15, row_token_num = 6):
    """Add a top/left margin to an image and draw a wrapped caption on it.

    The margin is ``marg_ratio`` times the longer image side and is filled
    with the first colour reported by ``extcolors`` for the image. The
    caption is broken into rows of at most ``row_token_num`` words; each row
    is drawn in black, one step further down and to the right than the
    previous one.

    The caption is split on any run of whitespace. Splitting on single
    spaces produced empty tokens for repeated spaces (which still used up
    row slots) and left embedded newlines inside tokens.

    Args:
        input_image: PIL image (anything exposing ``save``).
        caption: Caption text.
        marg_ratio: Margin size as a fraction of the longer image side.
        row_token_num: Maximum number of words per caption row.

    Returns:
        The new image with margin and caption.
    """
    assert hasattr(input_image, "save")
    marg_size = int(marg_ratio * max(input_image.size))
    colors, _ = extcolors.extract_from_image(input_image)
    input_image = add_margin(input_image, marg_size, 0, 0, marg_size, colors[0][0])

    font_size = int(marg_size / 4)
    font = ImageFont.truetype("DejaVuSerif-Italic.ttf" ,font_size)

    words = caption.split()
    caption_rows = [" ".join(row) for row in batch_as_list(words, row_token_num)]

    draw = ImageDraw.Draw(input_image)
    for row_number, row_text in enumerate(caption_rows, start=1):
        # Both coordinates advance by 1.1 font heights per row.
        position = (
            font_size * row_number * 1.1,
            font_size * (row_number * 1.1),
        )
        draw.text(position, row_text, fill="black", font = font)
    return input_image


def expand2square(pil_img, background_color):
    """Pad an image to a square, centring it on a solid background.

    Fixes two defects of the earlier version: the portrait branch passed a
    bare integer as the paste position (which is not a valid box), and the
    landscape branch pasted the picture flush with the bottom edge instead
    of centring it.

    Args:
        pil_img: PIL image to pad.
        background_color: Fill colour for the added area.

    Returns:
        ``pil_img`` itself when it is already square, otherwise a new square
        image of the same mode whose side equals the longer original side.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img
    side = max(width, height)
    result = Image.new(pil_img.mode, (side, side), background_color)
    # One of the two offsets is always zero; the other centres the picture.
    result.paste(pil_img, ((side - width) // 2, (side - height) // 2))
    return result

def generate_video(images, video_name = 'ppt.avi'):
    """Write a sequence of PIL images to a video file, one image per frame.

    Frames are first saved as PNG files in a temporary directory and read
    back with OpenCV. The temporary files and the video writer are released
    even if saving, reading or writing fails; previously a failure left
    stray PNGs in the working directory and an unreleased writer.

    Args:
        images: Iterable of objects with a ``save(path)`` method. The frame
            size is taken from the first image.
        video_name: Output path of the video.

    Raises:
        ValueError: If ``images`` is empty.
    """
    import cv2
    import tempfile

    images = list(images)
    if not images:
        raise ValueError("generate_video needs at least one image")

    with tempfile.TemporaryDirectory() as frame_dir:
        frame_paths = []
        for index, im in enumerate(images):
            frame_path = os.path.join(frame_dir, "{:06d}.png".format(index))
            im.save(frame_path)
            frame_paths.append(frame_path)

        # The first frame fixes the width and height of the whole video.
        first_frame = cv2.imread(frame_paths[0])
        height, width, _ = first_frame.shape

        # Same writer settings as before: fourcc 0 and 1 frame per second.
        video = cv2.VideoWriter(video_name, 0, 1, (width, height))
        try:
            video.write(first_frame)
            for frame_path in frame_paths[1:]:
                video.write(cv2.imread(frame_path))
        finally:
            video.release()

def make_video_from_image_list(image_list, video_name = "ppt.avi"):
    """Square, resize and encode a list of PIL images as a video.

    Any existing file at ``video_name`` is removed first. Every image is
    padded to a square with the first colour ``extcolors`` reports for it,
    resized to the largest side found across all images, and handed to
    ``generate_video``.

    Returns:
        ``video_name``.
    """
    if os.path.exists(video_name):
        os.remove(video_name)
    assert all(hasattr(im, "save") for im in image_list)

    # Largest width or height over all images.
    side = max(max(dims) for dims in zip(*(im.size for im in image_list)))

    squared = []
    for im in image_list:
        fill_color = extcolors.extract_from_image(im)[0][0][0]
        squared.append(expand2square(im, fill_color).resize((side, side)))

    generate_video(squared, video_name = video_name)
    return video_name

def style_transfer_func(content_img, style_img):
    """Apply the style image to the content image unless the style is blank.

    When the first colour ``extcolors`` reports for ``style_img`` is pure
    white and accounts for more than 95% of the counted pixels, the content
    image is returned untouched. Otherwise both images are converted to
    arrays and passed to ``perform_neural_transfer``.

    Returns:
        A PIL image: either ``content_img`` itself or the stylized result.
    """
    assert hasattr(content_img, "save")
    assert hasattr(style_img, "save")

    colors, _ = extcolors.extract_from_image(style_img)
    if colors:
        top_color, top_count = colors[0][0], colors[0][1]
        if top_color == (255, 255, 255):
            total_count = sum(entry[1] for entry in colors)
            if top_count / total_count > 0.95:
                return content_img

    stylized = perform_neural_transfer(np.asarray(content_img), np.asarray(style_img))
    assert hasattr(stylized, "save")
    return stylized


def gen_images_from_event_fact(current_model, event_fact = DEFAULT_PROMPT, role_name = DEFAULT_ROLE,
    style_pic = None
):
    """Produce the story video for an event.

    Steps: reason about the event, turn the results into image prompts,
    render one image per prompt (pausing two seconds between requests),
    optionally restyle the images with ``style_pic``, caption them, move the
    third image (the event itself) to the front, and encode the video.

    Args:
        current_model: Model name forwarded to ``generate_txt2img``.
        event_fact: Event description containing the placeholder "X".
        role_name: Name substituted for "X".
        style_pic: Optional array-like style image; used only when it is not
            ``None`` and has a ``size`` attribute.

    Returns:
        Path of the generated video file.
    """
    reasoning = produce_4_event(event_fact)
    captions, sd_prompts = transform_4_event_as_sd_prompts(
        event_fact, reasoning, role_name = role_name
    )

    frames = []
    for sd_prompt in tqdm(sd_prompts):
        frames.append(
            generate_txt2img(current_model, sd_prompt, is_negative=False, image_style="None style")
        )
        sleep(2)
    frames = [im for im in frames if hasattr(im, "save")]

    if style_pic is not None and hasattr(style_pic, "size"):
        style_pic = Image.fromarray(style_pic.astype(np.uint8))
        print("perform styling.....")
        frames = [style_transfer_func(im, style_pic) for im in tqdm(frames)]

    frames = [add_caption_on_image(im, text) for im, text in zip(frames, captions)]

    # Lead with the event image, then the rest in their original order.
    ordered = [frames[2]]
    for frame in frames:
        if frame not in ordered:
            ordered.append(frame)

    return make_video_from_image_list(ordered)

def image_click(images, evt: gr.SelectData,
    ):
    """Return the ``"name"`` field of the gallery entry that was selected.

    ``images`` is indexed with ``evt.index``; the first element of that
    entry is expected to be a mapping with a ``"name"`` key (presumably the
    image file path; confirm against the gallery payload format).
    """
    selected_entry = images[evt.index]
    return selected_entry[0]["name"]

def get_book_covers():
    """List the book-cover images available for the style gallery.

    Searches ``book_cover_dir`` recursively for ``.jpg``, ``.png`` and
    ``.jpeg`` files (in that order), skipping files whose name starts with
    an underscore. Names are derived with ``pathlib``, so the result does not
    depend on the platform's path separator and a file name containing
    extra dots keeps them in its label.

    Returns:
        A list of ``(path, label)`` tuples, where ``label`` is the file name
        without its extension. Entries whose path contains
        ``DEFAULT_BOOK_COVER`` come first; the remaining order is preserved.
    """
    cover_dir = pathlib.Path("book_cover_dir")
    covers = []
    for pattern in ("*.jpg", "*.png", "*.jpeg"):
        for path in cover_dir.rglob(pattern):
            if path.name.startswith("_"):
                continue
            covers.append((str(path), path.stem))

    # Stable sort: only the default cover moves, to the front.
    covers.sort(
        key = lambda cover: DEFAULT_BOOK_COVER in pathlib.PurePath(cover[0]).as_posix(),
        reverse = True,
    )
    return covers

# Gradio UI: a model dropdown and cover gallery on the left; the selected
# cover, role/event inputs, the generate button and the video on the right.
with gr.Blocks(css=".caption-label {display:none}") as demo:
    # NOTE(review): `favicon` is not referenced anywhere else in the visible code.
    favicon = '<img src="" width="48px" style="display: inline">'
    gr.Markdown(
        f"""<h1><center> 🎥💬 Comet Atomic Story Teller</center></h1>
            """
    )
    with gr.Row():
        # Left column: model choice and the gallery of available covers.
        with gr.Column(elem_id="prompt-container"):
            current_model = gr.Dropdown(label="Current Model", choices=list_models, value="Pixel-Art-XL")
            style_reference_input_gallery = gr.Gallery(get_book_covers(),
                            height = 768 + (768 - 512) + 64 + 32,
                            label = "StoryBook Cover (click to use)",
                            object_fit = "contain"
                            )
        # Right column: chosen cover, text inputs, trigger button and output.
        with gr.Column(elem_id="prompt-container"):
            style_reference_input_image = gr.Image(
                            label = "StoryBook Cover (you can upload yourself or click from left gallery)",
                            value = DEFAULT_BOOK_COVER,
                            interactive = True,
                            )
            with gr.Row():
                role_name = gr.Textbox(label="Role (X)", placeholder=DEFAULT_ROLE, lines=1,
                    elem_id="prompt-text-input", value = DEFAULT_ROLE,
                    info = "You should set the Role (X) with some famous man (like: Confucius Superman)"
                    )
                text_prompt = gr.Textbox(label="Event Prompt", placeholder=DEFAULT_PROMPT,
                    lines=1, elem_id="prompt-text-input", value = DEFAULT_PROMPT,
                    info = "You should set the prompt in format 'X do something', X is the role in the left."
                    )
            with gr.Row():
                text_button = gr.Button("Generate", variant='primary', elem_id="gen-button")

            with gr.Row():
                video_output = gr.Video(label = "Story Video", elem_id="gallery", height = 768,)

    # Selecting a gallery entry copies that cover into the style image input.
    style_reference_input_gallery.select(
            image_click, style_reference_input_gallery, style_reference_input_image
    )

    # The inputs are passed positionally to gen_images_from_event_fact:
    # (current_model, event_fact, role_name, style_pic).
    text_button.click(gen_images_from_event_fact, inputs=[current_model, text_prompt, role_name, style_reference_input_image],
        outputs=video_output)

    # Each example row follows the order of `inputs` below:
    # model, cover image path, role, event prompt.
    gr.Examples(
        [
            ["Pixel-Art-XL", "book_cover_dir/Blank.png", "Confucius", "X read a book"],
            ["Pixel-Art-XL", "book_cover_dir/Blank.png", "Superman", "X go to Istanbul"],
            ["Pixel-Art-XL", "book_cover_dir/Paint me a picture of the Great Wall of China in t.jpg", "Monk Xuanzang", "X went to the West to obtain Buddhist scriptures"],
            ["SD-1.5", "book_cover_dir/zh_ancient_tree.jpg", "Monk Xuanzang", "X went to the West to obtain Buddhist scriptures"],
            ["SD-1.5", "book_cover_dir/Blank.png", "Mickey Mouse", "X attends a party"],
            ["SD-1.5", "book_cover_dir/an illustration of a city.jpg", "The Eastern Roman Emperor", "X defeats Mongol Invaders"],
            ["SD-1.5", "book_cover_dir/River.png", "Napoleon", "X riding a horse"],
            ["SD-1.5", "book_cover_dir/seaport.png", "The Pope", "X is being crowned"],
        ],
        inputs = [current_model, style_reference_input_image, role_name, text_prompt]
    )

# Start the app; show_api=False is passed to launch().
demo.launch(show_api=False)