Xu Xuenan committed on
Commit a121edc
1 Parent(s): 644bfda

Initial commit

.gitignore ADDED
@@ -0,0 +1,165 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+
+ generated_stories/
app.py ADDED
@@ -0,0 +1,258 @@
+ from pathlib import Path
+ import argparse
+ import shutil
+ import time
+ import uuid
+ import subprocess
+
+ import gradio as gr
+ import yaml
+ import torch.multiprocessing as mp
+
+ mp.set_start_method('spawn', force=True)
+
+ from mm_story_agent import MMStoryAgent
+
+ try:
+     result = subprocess.run(["convert", "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+     imagemagick_installed = True
+ except FileNotFoundError:
+     imagemagick_installed = False
+
+ if not imagemagick_installed:
+     import os
+     os.system("apt update -y")
+     os.system("apt install -y imagemagick")
+     os.system("cp policy.xml /etc/ImageMagick-6/")
+
+
+ with open("configs/mm_story_agent.yaml", "r") as reader:
+     config = yaml.load(reader, Loader=yaml.FullLoader)
+
+
+ default_story_setting = config["story_setting"]
+ default_story_gen_config = config["story_gen_config"]
+ default_slideshow_effect = config["slideshow_effect"]
+ default_image_config = config["image_generation"]
+ default_sound_config = config["sound_generation"]
+ default_music_config = config["music_generation"]
+
+
+ def set_generating_progress_text(text):
+     return gr.update(visible=True, value=f"<h3>{text} ...</h3>")
+
+ def set_text_invisible():
+     return gr.update(visible=False)
+
+ def deep_update(original, updates):
+     for key, value in updates.items():
+         if isinstance(value, dict):
+             original[key] = deep_update(original.get(key, {}), value)
+         else:
+             original[key] = value
+     return original
+
+ def update_page(direction, page, story_data):
+
+     orig_page = page
+     if direction == 'next' and page < len(story_data) - 1:
+         page = orig_page + 1
+     elif direction == 'prev' and page > 0:
+         page = orig_page - 1
+
+     return page, story_data[page]
+
+ def write_story_fn(story_topic, main_role, scene,
+                    num_outline, temperature,
+                    current_page,
+                    progress=gr.Progress(track_tqdm=True)):
+     config["story_dir"] = f"generated_stories/{time.strftime('%Y%m%d-%H%M%S') + '-' + str(uuid.uuid1().hex)}"
+     deep_update(config, {
+         "story_setting": {
+             "story_topic": story_topic,
+             "main_role": main_role,
+             "scene": scene,
+         },
+         "story_gen_config": {
+             "num_outline": num_outline,
+             "temperature": temperature
+         },
+     })
+     story_gen_agent = MMStoryAgent()
+     pages = story_gen_agent.write_story(config)
+     # story_data, story_accordion, story_content
+     return pages, gr.update(visible=True), pages[current_page], gr.update()
+
+ def modality_assets_generation_fn(
+         height, width, image_seed, sound_guidance_scale, sound_seed,
+         n_candidate_per_text, music_duration,
+         story_data):
+     deep_update(config, {
+         "image_generation": {
+             "obj_cfg": {
+                 "height": height,
+                 "width": width,
+             },
+             "call_cfg": {
+                 "seed": image_seed
+             }
+         },
+         "sound_generation": {
+             "call_cfg": {
+                 "guidance_scale": sound_guidance_scale,
+                 "seed": sound_seed,
+                 "n_candidate_per_text": n_candidate_per_text
+             }
+         },
+         "music_generation": {
+             "call_cfg": {
+                 "duration": music_duration
+             }
+         }
+     })
+     story_gen_agent = MMStoryAgent()
+     images = story_gen_agent.generate_modality_assets(config, story_data)
+     # image gallery
+     return gr.update(visible=True, value=images, columns=[len(images)], rows=[1], height="auto")
+
+ def compose_storytelling_video_fn(
+         fade_duration, slide_duration, zoom_speed, move_ratio,
+         sound_volume, music_volume, bg_speech_ratio, fps,
+         story_data,
+         progress=gr.Progress(track_tqdm=True)):
+     deep_update(config, {
+         "slideshow_effect": {
+             "fade_duration": fade_duration,
+             "slide_duration": slide_duration,
+             "zoom_speed": zoom_speed,
+             "move_ratio": move_ratio,
+             "sound_volume": sound_volume,
+             "music_volume": music_volume,
+             "bg_speech_ratio": bg_speech_ratio,
+             "fps": fps
+         },
+     })
+     story_gen_agent = MMStoryAgent()
+     story_gen_agent.compose_storytelling_video(config, story_data)
+
+     # video_output
+     return Path(config["story_dir"]) / "output.mp4"
+
+
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+
+     gr.HTML("""
+         <h1 style="text-align: center;">MM-StoryAgent</h1>
+         <p style="font-size: 16px;">This is a demo for generating attractive storytelling videos based on the given story setting.</p>
+     """)
+
+     with gr.Row():
+         with gr.Column():
+             story_topic = gr.Textbox(label="Story Topic", value=default_story_setting["story_topic"])
+             main_role = gr.Textbox(label="Main Role", value=default_story_setting["main_role"])
+             scene = gr.Textbox(label="Scene", value=default_story_setting["scene"])
+             chapter_num = gr.Number(label="Chapter Number", value=default_story_gen_config["num_outline"])
+             temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Temperature", value=default_story_gen_config["temperature"])
+
+             with gr.Accordion("Detailed Image Configuration (Optional)", open=False):
+                 height = gr.Slider(label="Height", minimum=256, maximum=1024, step=32, value=default_image_config["obj_cfg"]['height'])
+                 width = gr.Slider(label="Width", minimum=256, maximum=1024, step=32, value=default_image_config["obj_cfg"]['width'])
+                 image_seed = gr.Number(label="Image Seed", value=default_image_config["call_cfg"]['seed'])
+
+             with gr.Accordion("Detailed Sound Configuration (Optional)", open=False):
+                 sound_guidance_scale = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=7.0, step=0.5, value=default_sound_config["call_cfg"]['guidance_scale'])
+                 sound_seed = gr.Number(label="Sound Seed", value=default_sound_config["call_cfg"]['seed'])
+                 n_candidate_per_text = gr.Slider(label="Number of Candidates per Text", minimum=0, maximum=5, step=1, value=default_sound_config["call_cfg"]['n_candidate_per_text'])
+
+             with gr.Accordion("Detailed Music Configuration (Optional)", open=False):
+                 music_duration = gr.Number(label="Music Duration", minimum=30.0, maximum=120.0, value=default_music_config["call_cfg"]["duration"])
+
+             with gr.Accordion("Detailed Slideshow Effect (Optional)", open=False):
+                 fade_duration = gr.Slider(label="Fade Duration", minimum=0.1, maximum=1.5, step=0.1, value=default_slideshow_effect['fade_duration'])
+                 slide_duration = gr.Slider(label="Slide Duration", minimum=0.1, maximum=1.0, step=0.1, value=default_slideshow_effect['slide_duration'])
+                 zoom_speed = gr.Slider(label="Zoom Speed", minimum=0.1, maximum=2.0, step=0.1, value=default_slideshow_effect['zoom_speed'])
+                 move_ratio = gr.Slider(label="Move Ratio", minimum=0.8, maximum=1.0, step=0.05, value=default_slideshow_effect['move_ratio'])
+                 sound_volume = gr.Slider(label="Sound Volume", minimum=0.0, maximum=1.0, step=0.1, value=default_slideshow_effect['sound_volume'])
+                 music_volume = gr.Slider(label="Music Volume", minimum=0.0, maximum=1.0, step=0.1, value=default_slideshow_effect['music_volume'])
+                 bg_speech_ratio = gr.Slider(label="Background / Speech Ratio", minimum=0.0, maximum=1.0, step=0.1, value=default_slideshow_effect['bg_speech_ratio'])
+                 fps = gr.Slider(label="FPS", minimum=1, maximum=30, step=1, value=default_slideshow_effect['fps'])
+
+
+         with gr.Column():
+             story_data = gr.State([])
+
+             story_generation_information = gr.Markdown(
+                 label="Story Generation Status",
+                 value="<h3>Generating Story Script ......</h3>",
+                 visible=False)
+             with gr.Accordion(label="Story Content", open=False, visible=False) as story_accordion:
+                 with gr.Row():
+                     prev_button = gr.Button("Previous Page")
+                     next_button = gr.Button("Next Page")
+                 story_content = gr.Textbox(label="Page Content")
+             video_generation_information = gr.Markdown(label="Generation Status", value="<h3>Generating Video ......</h3>", visible=False)
+             image_gallery = gr.Gallery(label="Images", show_label=False, visible=False)
+             video_generation_btn = gr.Button("Generate Video")
+             video_output = gr.Video(label="Generated Story", interactive=False)
+
+     current_page = gr.State(0)
+
+     prev_button.click(
+         fn=update_page,
+         inputs=[gr.State("prev"), current_page, story_data],
+         outputs=[current_page, story_content]
+     )
+     next_button.click(
+         fn=update_page,
+         inputs=[gr.State("next"), current_page, story_data],
+         outputs=[current_page, story_content])
+
+     # (possibly) update role description and scripts
+
+     video_generation_btn.click(
+         fn=set_generating_progress_text,
+         inputs=[gr.State("Generating Story")],
+         outputs=video_generation_information
+     ).then(
+         fn=write_story_fn,
+         inputs=[story_topic, main_role, scene,
+                 chapter_num, temperature,
+                 current_page],
+         outputs=[story_data, story_accordion, story_content, video_output]
+     ).then(
+         fn=set_generating_progress_text,
+         inputs=[gr.State("Generating Modality Assets")],
+         outputs=video_generation_information
+     ).then(
+         fn=modality_assets_generation_fn,
+         inputs=[height, width, image_seed, sound_guidance_scale, sound_seed,
+                 n_candidate_per_text, music_duration,
+                 story_data],
+         outputs=[image_gallery]
+     ).then(
+         fn=set_generating_progress_text,
+         inputs=[gr.State("Composing Video")],
+         outputs=video_generation_information
+     ).then(
+         fn=compose_storytelling_video_fn,
+         inputs=[fade_duration, slide_duration, zoom_speed, move_ratio,
+                 sound_volume, music_volume, bg_speech_ratio, fps,
+                 story_data],
+         outputs=[video_output]
+     ).then(
+         fn=lambda: gr.update(visible=False),
+         inputs=[],
+         outputs=[image_gallery]
+     ).then(
+         fn=set_generating_progress_text,
+         inputs=[gr.State("Generation Finished")],
+         outputs=video_generation_information
+     )
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--share", default=False, action="store_true")
+
+     args = parser.parse_args()
+     demo.launch(share=args.share)
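For reference, every callback above funnels UI values into the shared config dict through deep_update, which merges nested dictionaries instead of overwriting whole sub-trees. A minimal, self-contained sketch of that behavior (the values below are illustrative, not taken from the config file):

    def deep_update(original, updates):
        # Recursively merge `updates` into `original`, keeping keys that are not overridden.
        for key, value in updates.items():
            if isinstance(value, dict):
                original[key] = deep_update(original.get(key, {}), value)
            else:
                original[key] = value
        return original

    config = {"image_generation": {"obj_cfg": {"height": 512, "width": 1024}, "call_cfg": {"seed": 1}}}
    deep_update(config, {"image_generation": {"obj_cfg": {"height": 768}}})
    print(config["image_generation"])
    # {'obj_cfg': {'height': 768, 'width': 1024}, 'call_cfg': {'seed': 1}}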
configs/mm_story_agent.yaml ADDED
@@ -0,0 +1,75 @@
+ story_dir: generated_stories/20240808_1130
+ audio_sample_rate: &audio_sample_rate 16000
+ audio_codec: mp3 # [mp3, aac, ...]
+
+
+ story_setting:
+   story_topic: "Time Management: A child learning how to manage their time effectively."
+   main_role: "(no main role specified)"
+   scene: "(no scene specified)"
+
+ story_gen_config:
+   max_conv_turns: 3
+   num_outline: 4
+   temperature: 0.5
+
+ caption_config:
+   font: resources/font/msyh.ttf
+   # bg_color: LightGrey
+   fontsize: 32
+   color: white
+   # stroke_color: white
+   # stroke_width: 0.5
+   max_single_caption_length: 50
+
+ sound_generation:
+   call_cfg:
+     guidance_scale: 3.5
+     seed: 0
+     ddim_steps: 200
+     n_candidate_per_text: 3
+   revise_cfg:
+     num_turns: 3
+   sample_rate: *audio_sample_rate
+
+
+ speech_generation:
+   call_cfg:
+     voice: longyuan
+   sample_rate: *audio_sample_rate
+
+
+ image_generation:
+   revise_cfg:
+     num_turns: 3
+   obj_cfg:
+     model_name: stabilityai/stable-diffusion-xl-base-1.0
+     id_length: 2
+     height: 512
+     width: 1024
+   call_cfg:
+     seed: 112536
+     guidance_scale: 10.0
+     style_name: "Storybook" # ['(No style)', 'Japanese Anime', 'Digital/Oil Painting', 'Pixar/Disney Character',
+                             # 'Photographic', 'Comic book', 'Line art', 'Black and White Film Noir', 'Isometric Rooms']
+
+ music_generation:
+   revise_cfg:
+     num_turns: 3
+   call_cfg:
+     duration: 60.0
+
+ slideshow_effect:
+   fade_duration: 0.8
+   slide_duration: 0.4
+   zoom_speed: 0.5
+   move_ratio: 0.9
+
+   sound_volume: 0.6
+   music_volume: 0.5
+   bg_speech_ratio: 0.6
+
+   fps: 8
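As a reading aid, the &audio_sample_rate anchor is resolved by the YAML loader, so every field that references *audio_sample_rate ends up holding the same integer. A small sketch of how app.py consumes this file (it assumes the script is run from the repository root):

    import yaml

    with open("configs/mm_story_agent.yaml", "r") as reader:
        config = yaml.load(reader, Loader=yaml.FullLoader)

    print(config["audio_sample_rate"])                            # 16000
    print(config["sound_generation"]["sample_rate"])              # 16000, via the *audio_sample_rate alias
    print(config["image_generation"]["call_cfg"]["style_name"])   # "Storybook"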
mm_story_agent/__init__.py ADDED
@@ -0,0 +1,105 @@
+ import time
+ import json
+ from pathlib import Path
+
+ import torch.multiprocessing as mp
+
+ from mm_story_agent.modality_agents.story_agent import QAOutlineStoryWriter
+ from mm_story_agent.modality_agents.speech_agent import CosyVoiceAgent
+ from mm_story_agent.modality_agents.sound_agent import AudioLDM2Agent
+ from mm_story_agent.modality_agents.music_agent import MusicGenAgent
+ from mm_story_agent.modality_agents.image_agent import StoryDiffusionAgent
+ from mm_story_agent.video_compose_agent import VideoComposeAgent
+
+
+ class MMStoryAgent:
+
+     def __init__(self) -> None:
+         self.modalities = ["image", "sound", "speech", "music"]
+         self.modality_agent_class = {
+             "image": StoryDiffusionAgent,
+             "sound": AudioLDM2Agent,
+             "speech": CosyVoiceAgent,
+             "music": MusicGenAgent
+         }
+         self.agents = {}
+
+     def call_modality_agent(self, agent, pages, save_path, return_dict):
+         result = agent.call(pages, save_path)
+         modality = result["modality"]
+         return_dict[modality] = result
+
+     def write_story(self, config):
+         story_writer = QAOutlineStoryWriter(config["story_gen_config"])
+         pages = story_writer.call(config["story_setting"])
+         # pages = [
+         #     "In the heart of a dense forest, Flicker the Fox, nestled in his cozy den, stumbled upon an ancient computer hidden beneath a pile of soft moss and forgotten treasures. Surrounded by maps of unexplored territories and codes scribbled on parchment, Flicker's eyes widened with intrigue as he traced his paw over the mysterious machine.",
+         #     "Flicker's den was a testament to his adventurous spirit, a haven filled with artifacts from his previous quests. The discovery of the computer, however, sparked a new kind of excitement within him, a curiosity that went beyond the physical boundaries of his forest home.",
+         #     "With a determined gleam in his eye, Flicker trotted out of his den in search of his parents. He had questions about this relic that couldn't wait, eager to understand the secrets it held and how it functioned in a world so different from his own.",
+         #     "Excited by his parents' encouragement, Flicker eagerly started his journey into the world of typing. His paws clumsily hit the wrong keys at first, resulting in a string of random letters and numbers on the screen. But with every mistake, Flicker's determination grew stronger.",
+         #     "Days turned into weeks, and Flicker's persistence paid off. His paws now moved gracefully across the keyboard, his eyes focused on the screen as he typed out simple messages and commands. The once foreign device was becoming a familiar tool, and Flicker felt a sense of accomplishment wash over him.",
+         #     "One evening, as the moon illuminated the forest, a wise old owl named Ollie perched on a branch outside Flicker's den. With a hoot and a smile, Ollie shared the magic of keyboard shortcuts, turning Flicker's typing sessions into thrilling adventures. Each shortcut was like a secret code, and Flicker couldn't wait to master them all.",
+         #     "Eager to explore beyond the basics, Flicker's curiosity led him to the vast digital world of the internet. With guidance from his parents and Ollie, he learned how to navigate safely, discovering interactive games and educational videos that opened his eyes to the wonders beyond his forest.",
+         #     "Each day, Flicker would sit before the screen, his paws dancing over the keys as he clicked through virtual tours of distant lands, watched videos of creatures he'd never seen, and played games that taught him about science and history. The computer became a window to a world far larger than he could have imagined.",
+         # ]
+         return pages
+
+     def generate_modality_assets(self, config, pages):
+         script_data = {"pages": [{"story": page} for page in pages]}
+         story_dir = Path(config["story_dir"])
+
+         for sub_dir in self.modalities:
+             (story_dir / sub_dir).mkdir(exist_ok=True, parents=True)
+
+         agents = {}
+         for modality in self.modalities:
+             agents[modality] = self.modality_agent_class[modality](config[modality + "_generation"])
+
+         processes = []
+         return_dict = mp.Manager().dict()
+
+         for modality in self.modalities:
+             p = mp.Process(target=self.call_modality_agent, args=(agents[modality], pages, story_dir / modality, return_dict))
+             processes.append(p)
+             p.start()
+
+         for p in processes:
+             p.join()
+
+         for modality, result in return_dict.items():
+             try:
+                 if result["modality"] == "image":
+                     images = result["generation_results"]
+                     for idx in range(len(pages)):
+                         script_data["pages"][idx]["image_prompt"] = result["prompts"][idx]
+                 elif result["modality"] == "sound":
+                     for idx in range(len(pages)):
+                         script_data["pages"][idx]["sound_prompt"] = result["prompts"][idx]
+                 elif result["modality"] == "music":
+                     script_data["music_prompt"] = result["prompt"]
+             except Exception as e:
+                 print(f"Error occurred during generation: {e}")
+
+         with open(story_dir / "script_data.json", "w") as writer:
+             json.dump(script_data, writer, ensure_ascii=False, indent=4)
+
+         return images
+
+     def compose_storytelling_video(self, config, pages):
+         video_compose_agent = VideoComposeAgent()
+         video_compose_agent.call(pages, config)
+
+     def call(self, config):
+         pages = self.write_story(config)
+         images = self.generate_modality_assets(config, pages)
+         self.compose_storytelling_video(config, pages)
+
+
+ if __name__ == "__main__":
+
+     from arg_parser import parse_yaml_and_cmd
+
+     config = parse_yaml_and_cmd()
+     mm_story_agent = MMStoryAgent()
+
+     mm_story_agent.call(config)
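A minimal sketch of driving the same pipeline from Python instead of the Gradio app; the config dict mirrors configs/mm_story_agent.yaml, the story_dir value is an arbitrary example, and GPU availability plus the required API keys and model downloads are assumed:

    import yaml
    from mm_story_agent import MMStoryAgent

    with open("configs/mm_story_agent.yaml", "r") as reader:
        config = yaml.load(reader, Loader=yaml.FullLoader)
    config["story_dir"] = "generated_stories/manual_run"

    agent = MMStoryAgent()
    pages = agent.write_story(config)                        # list of page texts
    images = agent.generate_modality_assets(config, pages)   # spawns one process per modality, writes script_data.json
    agent.compose_storytelling_video(config, pages)           # writes <story_dir>/output.mp4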
mm_story_agent/modality_agents/image_agent.py ADDED
@@ -0,0 +1,663 @@
1
+ from typing import List
2
+ import json
3
+ import os
4
+ import random
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from diffusers import StableDiffusionXLPipeline, DDIMScheduler
10
+
11
+ from mm_story_agent.modality_agents.llm import QwenAgent
12
+ from mm_story_agent.prompts_en import role_extract_system, role_review_system, \
13
+ story_to_image_reviser_system, story_to_image_review_system
14
+
15
+
16
+ def setup_seed(seed):
17
+ torch.manual_seed(seed)
18
+ torch.cuda.manual_seed_all(seed)
19
+ np.random.seed(seed)
20
+ random.seed(seed)
21
+ torch.backends.cudnn.deterministic = True
22
+
23
+
24
+ class AttnProcessor(torch.nn.Module):
25
+ r"""
26
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
27
+ """
28
+ def __init__(
29
+ self,
30
+ hidden_size=None,
31
+ cross_attention_dim=None,
32
+ ):
33
+ super().__init__()
34
+ if not hasattr(F, "scaled_dot_product_attention"):
35
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
36
+
37
+ def __call__(
38
+ self,
39
+ attn,
40
+ hidden_states,
41
+ encoder_hidden_states=None,
42
+ attention_mask=None,
43
+ temb=None,
44
+ ):
45
+ residual = hidden_states
46
+
47
+ if attn.spatial_norm is not None:
48
+ hidden_states = attn.spatial_norm(hidden_states, temb)
49
+
50
+ input_ndim = hidden_states.ndim
51
+
52
+ if input_ndim == 4:
53
+ batch_size, channel, height, width = hidden_states.shape
54
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
55
+
56
+ batch_size, sequence_length, _ = (
57
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
58
+ )
59
+
60
+ if attention_mask is not None:
61
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
62
+ # scaled_dot_product_attention expects attention_mask shape to be
63
+ # (batch, heads, source_length, target_length)
64
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
65
+
66
+ if attn.group_norm is not None:
67
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
68
+
69
+ query = attn.to_q(hidden_states)
70
+
71
+ if encoder_hidden_states is None:
72
+ encoder_hidden_states = hidden_states
73
+ elif attn.norm_cross:
74
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
75
+
76
+ key = attn.to_k(encoder_hidden_states)
77
+ value = attn.to_v(encoder_hidden_states)
78
+
79
+ inner_dim = key.shape[-1]
80
+ head_dim = inner_dim // attn.heads
81
+
82
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
83
+
84
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
85
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
86
+
87
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
88
+ # TODO: add support for attn.scale when we move to Torch 2.1
89
+ hidden_states = F.scaled_dot_product_attention(
90
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
91
+ )
92
+
93
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
94
+ hidden_states = hidden_states.to(query.dtype)
95
+
96
+ # linear proj
97
+ hidden_states = attn.to_out[0](hidden_states)
98
+ # dropout
99
+ hidden_states = attn.to_out[1](hidden_states)
100
+
101
+ if input_ndim == 4:
102
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
103
+
104
+ if attn.residual_connection:
105
+ hidden_states = hidden_states + residual
106
+
107
+ hidden_states = hidden_states / attn.rescale_output_factor
108
+
109
+ return hidden_states
110
+
111
+
112
+ def cal_attn_mask_xl(total_length,
113
+ id_length,
114
+ sa32,
115
+ sa64,
116
+ height,
117
+ width,
118
+ device="cuda",
119
+ dtype=torch.float16):
120
+ nums_1024 = (height // 32) * (width // 32)
121
+ nums_4096 = (height // 16) * (width // 16)
122
+ bool_matrix1024 = torch.rand((1, total_length * nums_1024),device = device,dtype = dtype) < sa32
123
+ bool_matrix4096 = torch.rand((1, total_length * nums_4096),device = device,dtype = dtype) < sa64
124
+ bool_matrix1024 = bool_matrix1024.repeat(total_length,1)
125
+ bool_matrix4096 = bool_matrix4096.repeat(total_length,1)
126
+ for i in range(total_length):
127
+ bool_matrix1024[i:i+1,id_length*nums_1024:] = False
128
+ bool_matrix4096[i:i+1,id_length*nums_4096:] = False
129
+ bool_matrix1024[i:i+1,i*nums_1024:(i+1)*nums_1024] = True
130
+ bool_matrix4096[i:i+1,i*nums_4096:(i+1)*nums_4096] = True
131
+ mask1024 = bool_matrix1024.unsqueeze(1).repeat(1,nums_1024,1).reshape(-1,total_length * nums_1024)
132
+ mask4096 = bool_matrix4096.unsqueeze(1).repeat(1,nums_4096,1).reshape(-1,total_length * nums_4096)
133
+ return mask1024, mask4096
134
+
135
+
136
+ class SpatialAttnProcessor2_0(torch.nn.Module):
137
+ r"""
138
+ Attention processor for IP-Adapter for PyTorch 2.0.
139
+ Args:
140
+ hidden_size (`int`):
141
+ The hidden size of the attention layer.
142
+ cross_attention_dim (`int`):
143
+ The number of channels in the `encoder_hidden_states`.
144
+ text_context_len (`int`, defaults to 77):
145
+ The context length of the text features.
146
+ scale (`float`, defaults to 1.0):
147
+ the weight scale of image prompt.
148
+ """
149
+
150
+ def __init__(self,
151
+ global_attn_args,
152
+ hidden_size=None,
153
+ cross_attention_dim=None,
154
+ id_length=4,
155
+ device="cuda",
156
+ dtype=torch.float16,
157
+ height=1280,
158
+ width=720,
159
+ sa32=0.5,
160
+ sa64=0.5,
161
+ ):
162
+ super().__init__()
163
+ if not hasattr(F, "scaled_dot_product_attention"):
164
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
165
+ self.device = device
166
+ self.dtype = dtype
167
+ self.hidden_size = hidden_size
168
+ self.cross_attention_dim = cross_attention_dim
169
+ self.total_length = id_length + 1
170
+ self.id_length = id_length
171
+ self.id_bank = {}
172
+ self.height = height
173
+ self.width = width
174
+ self.sa32 = sa32
175
+ self.sa64 = sa64
176
+ self.write = True
177
+
178
+ self.global_attn_args = global_attn_args
179
+
180
+
181
+ def __call__(
182
+ self,
183
+ attn,
184
+ hidden_states,
185
+ encoder_hidden_states=None,
186
+ attention_mask=None,
187
+ temb=None
188
+ ):
189
+ total_count = self.global_attn_args["total_count"]
190
+ attn_count = self.global_attn_args["attn_count"]
191
+ cur_step = self.global_attn_args["cur_step"]
192
+ mask1024 = self.global_attn_args["mask1024"]
193
+ mask4096 = self.global_attn_args["mask4096"]
194
+
195
+ if self.write:
196
+ self.id_bank[cur_step] = [hidden_states[:self.id_length], hidden_states[self.id_length:]]
197
+ else:
198
+ encoder_hidden_states = torch.cat((self.id_bank[cur_step][0].to(self.device),
199
+ hidden_states[:1],
200
+ self.id_bank[cur_step][1].to(self.device), hidden_states[1:]))
201
+ # skip in early step
202
+ if cur_step < 5:
203
+ hidden_states = self.__call2__(attn, hidden_states, encoder_hidden_states, attention_mask, temb)
204
+ else: # 256 1024 4096
205
+ random_number = random.random()
206
+ if cur_step < 20:
207
+ rand_num = 0.3
208
+ else:
209
+ rand_num = 0.1
210
+ if random_number > rand_num:
211
+ if not self.write:
212
+ if hidden_states.shape[1] == (self.height // 32) * (self.width // 32):
213
+ attention_mask = mask1024[mask1024.shape[0] // self.total_length * self.id_length:]
214
+ else:
215
+ attention_mask = mask4096[mask4096.shape[0] // self.total_length * self.id_length:]
216
+ else:
217
+ if hidden_states.shape[1] == (self.height // 32) * (self.width // 32):
218
+ attention_mask = mask1024[:mask1024.shape[0] // self.total_length * self.id_length,
219
+ :mask1024.shape[0] // self.total_length * self.id_length]
220
+ else:
221
+ attention_mask = mask4096[:mask4096.shape[0] // self.total_length * self.id_length,
222
+ :mask4096.shape[0] // self.total_length * self.id_length]
223
+ hidden_states = self.__call1__(attn, hidden_states, encoder_hidden_states, attention_mask, temb)
224
+ else:
225
+ hidden_states = self.__call2__(attn, hidden_states, None, attention_mask, temb)
226
+ attn_count += 1
227
+ if attn_count == total_count:
228
+ attn_count = 0
229
+ cur_step += 1
230
+ mask1024, mask4096 = cal_attn_mask_xl(self.total_length,
231
+ self.id_length,
232
+ self.sa32,
233
+ self.sa64,
234
+ self.height,
235
+ self.width,
236
+ device=self.device,
237
+ dtype=self.dtype)
238
+ self.global_attn_args["mask1024"] = mask1024
239
+ self.global_attn_args["mask4096"] = mask4096
240
+
241
+ self.global_attn_args["attn_count"] = attn_count
242
+ self.global_attn_args["cur_step"] = cur_step
243
+
244
+ return hidden_states
245
+
246
+ def __call1__(
247
+ self,
248
+ attn,
249
+ hidden_states,
250
+ encoder_hidden_states=None,
251
+ attention_mask=None,
252
+ temb=None,
253
+ ):
254
+ residual = hidden_states
255
+ if attn.spatial_norm is not None:
256
+ hidden_states = attn.spatial_norm(hidden_states, temb)
257
+ input_ndim = hidden_states.ndim
258
+
259
+ if input_ndim == 4:
260
+ total_batch_size, channel, height, width = hidden_states.shape
261
+ hidden_states = hidden_states.view(total_batch_size, channel, height * width).transpose(1, 2)
262
+ total_batch_size, nums_token, channel = hidden_states.shape
263
+ img_nums = total_batch_size // 2
264
+ hidden_states = hidden_states.view(-1, img_nums, nums_token, channel).reshape(-1, img_nums * nums_token, channel)
265
+
266
+ batch_size, sequence_length, _ = hidden_states.shape
267
+
268
+ if attn.group_norm is not None:
269
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
270
+
271
+ query = attn.to_q(hidden_states)
272
+
273
+ if encoder_hidden_states is None:
274
+ encoder_hidden_states = hidden_states # B, N, C
275
+ else:
276
+ encoder_hidden_states = encoder_hidden_states.view(-1, self.id_length + 1, nums_token, channel).reshape(
277
+ -1, (self.id_length + 1) * nums_token, channel)
278
+
279
+ key = attn.to_k(encoder_hidden_states)
280
+ value = attn.to_v(encoder_hidden_states)
281
+
282
+
283
+ inner_dim = key.shape[-1]
284
+ head_dim = inner_dim // attn.heads
285
+
286
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
287
+
288
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
289
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
290
+ hidden_states = F.scaled_dot_product_attention(
291
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
292
+ )
293
+
294
+ hidden_states = hidden_states.transpose(1, 2).reshape(total_batch_size, -1, attn.heads * head_dim)
295
+ hidden_states = hidden_states.to(query.dtype)
296
+
297
+
298
+
299
+ # linear proj
300
+ hidden_states = attn.to_out[0](hidden_states)
301
+ # dropout
302
+ hidden_states = attn.to_out[1](hidden_states)
303
+
304
+
305
+ if input_ndim == 4:
306
+ hidden_states = hidden_states.transpose(-1, -2).reshape(total_batch_size, channel, height, width)
307
+ if attn.residual_connection:
308
+ hidden_states = hidden_states + residual
309
+ hidden_states = hidden_states / attn.rescale_output_factor
310
+ # print(hidden_states.shape)
311
+ return hidden_states
312
+
313
+ def __call2__(
314
+ self,
315
+ attn,
316
+ hidden_states,
317
+ encoder_hidden_states=None,
318
+ attention_mask=None,
319
+ temb=None):
320
+ residual = hidden_states
321
+
322
+ if attn.spatial_norm is not None:
323
+ hidden_states = attn.spatial_norm(hidden_states, temb)
324
+
325
+ input_ndim = hidden_states.ndim
326
+
327
+ if input_ndim == 4:
328
+ batch_size, channel, height, width = hidden_states.shape
329
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
330
+
331
+ batch_size, sequence_length, channel = (
332
+ hidden_states.shape
333
+ )
334
+ # print(hidden_states.shape)
335
+ if attention_mask is not None:
336
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
337
+ # scaled_dot_product_attention expects attention_mask shape to be
338
+ # (batch, heads, source_length, target_length)
339
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
340
+
341
+ if attn.group_norm is not None:
342
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
343
+
344
+ query = attn.to_q(hidden_states)
345
+
346
+ if encoder_hidden_states is None:
347
+ encoder_hidden_states = hidden_states # B, N, C
348
+ else:
349
+ encoder_hidden_states = encoder_hidden_states.view(-1, self.id_length + 1, sequence_length, channel).reshape(
350
+ -1, (self.id_length + 1) * sequence_length, channel)
351
+
352
+ key = attn.to_k(encoder_hidden_states)
353
+ value = attn.to_v(encoder_hidden_states)
354
+
355
+ inner_dim = key.shape[-1]
356
+ head_dim = inner_dim // attn.heads
357
+
358
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
359
+
360
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
361
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
362
+
363
+ hidden_states = F.scaled_dot_product_attention(
364
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
365
+ )
366
+
367
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
368
+ hidden_states = hidden_states.to(query.dtype)
369
+
370
+ # linear proj
371
+ hidden_states = attn.to_out[0](hidden_states)
372
+ # dropout
373
+ hidden_states = attn.to_out[1](hidden_states)
374
+
375
+ if input_ndim == 4:
376
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
377
+
378
+ if attn.residual_connection:
379
+ hidden_states = hidden_states + residual
380
+
381
+ hidden_states = hidden_states / attn.rescale_output_factor
382
+
383
+ return hidden_states
384
+
385
+
386
+ class StoryDiffusionSynthesizer:
387
+
388
+ def __init__(self,
389
+ num_pages: int,
390
+ height: int,
391
+ width: int,
392
+ model_name: str = "stabilityai/stable-diffusion-xl-base-1.0",
393
+ model_path: str = None,
394
+ id_length: int = 4,
395
+ num_steps: int = 50):
396
+ self.attn_args = {
397
+ "attn_count": 0,
398
+ "cur_step": 0,
399
+ "total_count": 0,
400
+ }
401
+ self.sa32 = 0.5
402
+ self.sa64 = 0.5
403
+ self.id_length = id_length
404
+ self.total_length = num_pages
405
+ self.height = height
406
+ self.width = width
407
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
408
+ self.dtype = torch.float16
409
+ self.num_steps = num_steps
410
+ self.styles = {
411
+ '(No style)': (
412
+ '{prompt}',
413
+ ''),
414
+ 'Japanese Anime': (
415
+ 'anime artwork illustrating {prompt}. created by japanese anime studio. highly emotional. best quality, high resolution, (Anime Style, Manga Style:1.3), Low detail, sketch, concept art, line art, webtoon, manhua, hand drawn, defined lines, simple shades, minimalistic, High contrast, Linear compositions, Scalable artwork, Digital art, High Contrast Shadows',
416
+ 'lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry'),
417
+ 'Digital/Oil Painting': (
418
+ '{prompt} . (Extremely Detailed Oil Painting:1.2), glow effects, godrays, Hand drawn, render, 8k, octane render, cinema 4d, blender, dark, atmospheric 4k ultra detailed, cinematic sensual, Sharp focus, humorous illustration, big depth of field',
419
+ 'anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry'),
420
+ 'Pixar/Disney Character': (
421
+ 'Create a Disney Pixar 3D style illustration on {prompt} . The scene is vibrant, motivational, filled with vivid colors and a sense of wonder.',
422
+ 'lowres, bad anatomy, bad hands, text, bad eyes, bad arms, bad legs, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, blurry, grayscale, noisy, sloppy, messy, grainy, highly detailed, ultra textured, photo'),
423
+ 'Photographic': (
424
+ 'cinematic photo {prompt} . Hyperrealistic, Hyperdetailed, detailed skin, matte skin, soft lighting, realistic, best quality, ultra realistic, 8k, golden ratio, Intricate, High Detail, film photography, soft focus',
425
+ 'drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry'),
426
+ 'Comic book': (
427
+ 'comic {prompt} . graphic illustration, comic art, graphic novel art, vibrant, highly detailed',
428
+ 'photograph, deformed, glitch, noisy, realistic, stock photo, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry'),
429
+ 'Line art': (
430
+ 'line art drawing {prompt} . professional, sleek, modern, minimalist, graphic, line art, vector graphics',
431
+ 'anime, photorealistic, 35mm film, deformed, glitch, blurry, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, mutated, realism, realistic, impressionism, expressionism, oil, acrylic, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry'),
432
+ 'Black and White Film Noir': (
433
+ '{prompt} . (b&w, Monochromatic, Film Photography:1.3), film noir, analog style, soft lighting, subsurface scattering, realistic, heavy shadow, masterpiece, best quality, ultra realistic, 8k',
434
+ 'anime, photorealistic, 35mm film, deformed, glitch, blurry, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, mutated, realism, realistic, impressionism, expressionism, oil, acrylic, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry'),
435
+ 'Isometric Rooms': (
436
+ 'Tiny cute isometric {prompt} . in a cutaway box, soft smooth lighting, soft colors, 100mm lens, 3d blender render',
437
+ 'anime, photorealistic, 35mm film, deformed, glitch, blurry, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, mutated, realism, realistic, impressionism, expressionism, oil, acrylic, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry'),
438
+ 'Storybook': (
439
+ "Cartoon style, cute illustration of {prompt}.",
440
+ 'realism, photo, realistic, lowres, bad hands, bad eyes, bad arms, bad legs, error, missing fingers, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, grayscale, noisy, sloppy, messy, grainy, ultra textured'
441
+ )
442
+ }
443
+
444
+ pipe = StableDiffusionXLPipeline.from_pretrained(
445
+ model_path if model_path is not None else model_name,
446
+ torch_dtype=torch.float16,
447
+ use_safetensors=True
448
+ )
449
+
450
+ pipe = pipe.to(self.device)
451
+
452
+ # pipe.id_encoder.to(self.device)
453
+
454
+ pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
455
+ pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
456
+ pipe.scheduler.set_timesteps(num_steps)
457
+ unet = pipe.unet
458
+
459
+ attn_procs = {}
460
+ ### Insert PairedAttention
461
+ for name in unet.attn_processors.keys():
462
+ cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
463
+ if name.startswith("mid_block"):
464
+ hidden_size = unet.config.block_out_channels[-1]
465
+ elif name.startswith("up_blocks"):
466
+ block_id = int(name[len("up_blocks.")])
467
+ hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
468
+ elif name.startswith("down_blocks"):
469
+ block_id = int(name[len("down_blocks.")])
470
+ hidden_size = unet.config.block_out_channels[block_id]
471
+ if cross_attention_dim is None and (name.startswith("up_blocks") ) :
472
+ attn_procs[name] = SpatialAttnProcessor2_0(
473
+ id_length=self.id_length,
474
+ device=self.device,
475
+ height=self.height,
476
+ width=self.width,
477
+ sa32=self.sa32,
478
+ sa64=self.sa64,
479
+ global_attn_args=self.attn_args
480
+ )
481
+ self.attn_args["total_count"] += 1
482
+ else:
483
+ attn_procs[name] = AttnProcessor()
484
+ print("successsfully load consistent self-attention")
485
+ print(f"number of the processor : {self.attn_args['total_count']}")
486
+ # unet.set_attn_processor(copy.deepcopy(attn_procs))
487
+ unet.set_attn_processor(attn_procs)
488
+ mask1024, mask4096 = cal_attn_mask_xl(
489
+ self.total_length,
490
+ self.id_length,
491
+ self.sa32,
492
+ self.sa64,
493
+ self.height,
494
+ self.width,
495
+ device=self.device,
496
+ dtype=torch.float16,
497
+ )
498
+
499
+ self.attn_args.update({
500
+ "mask1024": mask1024,
501
+ "mask4096": mask4096
502
+ })
503
+
504
+ self.pipe = pipe
505
+ self.negative_prompt = "naked, deformed, bad anatomy, disfigured, poorly drawn face, mutation, " \
+ "extra limb, ugly, disgusting, poorly drawn hands, missing limb, floating " \
+ "limbs, disconnected limbs, blurry, watermarks, oversaturated, distorted hands, amputation"
508
+
509
+ def set_attn_write(self,
510
+ value: bool):
511
+ unet = self.pipe.unet
512
+ for name, processor in unet.attn_processors.items():
513
+ cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
514
+ if cross_attention_dim is None:
515
+ if name.startswith("up_blocks") :
516
+ assert isinstance(processor, SpatialAttnProcessor2_0)
517
+ processor.write = value
518
+
519
+ def apply_style(self, style_name: str, positives: list, negative: str = ""):
520
+ p, n = self.styles.get(style_name, self.styles["(No style)"])
521
+ return [p.replace("{prompt}", positive) for positive in positives], n + ' ' + negative
522
+
523
+ def apply_style_positive(self, style_name: str, positive: str):
524
+ p, n = self.styles.get(style_name, self.styles["(No style)"])
525
+ return p.replace("{prompt}", positive)
526
+
527
+ def call(self,
528
+ prompts: List[str],
529
+ input_id_images = None,
530
+ start_merge_step = None,
531
+ style_name: str = "Pixar/Disney Character",
532
+ guidance_scale: float = 5.0,
533
+ seed: int = 2047):
534
+ assert len(prompts) == self.total_length, "The number of prompts should be equal to the number of pages."
535
+ setup_seed(seed)
536
+ generator = torch.Generator(device=self.device).manual_seed(seed)
537
+ torch.cuda.empty_cache()
538
+
539
+ id_prompts = prompts[:self.id_length]
540
+ real_prompts = prompts[self.id_length:]
541
+ self.set_attn_write(True)
542
+ self.attn_args.update({
543
+ "cur_step": 0,
544
+ "attn_count": 0
545
+ })
546
+ id_prompts, negative_prompt = self.apply_style(style_name, id_prompts, self.negative_prompt)
547
+ id_images = self.pipe(
548
+ id_prompts,
549
+ input_id_images=input_id_images,
550
+ start_merge_step=start_merge_step,
551
+ num_inference_steps=self.num_steps,
552
+ guidance_scale=guidance_scale,
553
+ height=self.height,
554
+ width=self.width,
555
+ negative_prompt=negative_prompt,
556
+ generator=generator).images
557
+
558
+ self.set_attn_write(False)
559
+ real_images = []
560
+ for real_prompt in real_prompts:
561
+ self.attn_args["cur_step"] = 0
562
+ real_prompt = self.apply_style_positive(style_name, real_prompt)
563
+ real_images.append(self.pipe(
564
+ real_prompt,
565
+ num_inference_steps=self.num_steps,
566
+ guidance_scale=guidance_scale,
567
+ height=self.height,
568
+ width=self.width,
569
+ negative_prompt=negative_prompt,
570
+ generator=generator).images[0]
571
+ )
572
+
573
+ images = id_images + real_images
574
+ return images
575
+
576
+
577
+ class StoryDiffusionAgent:
578
+
579
+ def __init__(self, config, llm_type="qwen2") -> None:
580
+ self.config = config
581
+ if llm_type == "qwen2":
582
+ self.LLM = QwenAgent
583
+
584
+ def call(self, pages: List, save_path: str):
585
+ role_dict = self.extract_role_from_story(pages, **self.config["revise_cfg"])
586
+ image_prompts = self.generate_image_prompt_from_story(pages, **self.config["revise_cfg"])
587
+ image_prompts_with_role_desc = []
588
+ for image_prompt in image_prompts:
589
+ for role, role_desc in role_dict.items():
590
+ if role in image_prompt:
591
+ image_prompt = image_prompt.replace(role, role_desc)
592
+ image_prompts_with_role_desc.append(image_prompt)
593
+ generation_agent = StoryDiffusionSynthesizer(
594
+ num_pages=len(pages),
595
+ **self.config["obj_cfg"]
596
+ )
597
+ images = generation_agent.call(
598
+ image_prompts_with_role_desc,
599
+ **self.config["call_cfg"]
600
+ )
601
+ for idx, image in enumerate(images):
602
+ image.save(save_path / f"p{idx + 1}.png")
603
+ return {
604
+ "prompts": image_prompts_with_role_desc,
605
+ "modality": "image",
606
+ "generation_results": images,
607
+ }
608
+
609
+ def extract_role_from_story(
610
+ self,
611
+ pages: List,
612
+ num_turns: int = 3
613
+ ):
614
+ role_extractor = self.LLM(role_extract_system, track_history=False)
615
+ role_reviewer = self.LLM(role_review_system, track_history=False)
616
+ roles = {}
617
+ review = ""
618
+ for turn in range(num_turns):
619
+ roles, success = role_extractor.run(json.dumps({
620
+ "story_content": pages,
621
+ "previous_result": roles,
622
+ "improvement_suggestions": review,
623
+ }, ensure_ascii=False
624
+ ))
625
+ roles = json.loads(roles.strip("```json").strip("```"))
626
+ review, success = role_reviewer.run(json.dumps({
627
+ "story_content": pages,
628
+ "role_descriptions": roles
629
+ }, ensure_ascii=False))
630
+ if review == "Check passed.":
631
+ break
632
+ return roles
633
+
634
+ def generate_image_prompt_from_story(
635
+ self,
636
+ pages: List,
637
+ num_turns: int = 3
638
+ ):
639
+ image_prompt_rewriter = self.LLM(story_to_image_reviser_system, track_history=False)
640
+ image_prompt_reviewer = self.LLM(story_to_image_review_system, track_history=False)
641
+ image_prompts = []
642
+
643
+ for page in pages:
644
+ review = ""
645
+ image_prompt = ""
646
+ for turn in range(num_turns):
647
+ image_prompt, success = image_prompt_rewriter.run(json.dumps({
648
+ "all_pages": pages,
649
+ "current_page": page,
650
+ "previous_result": image_prompt,
651
+ "improvement_suggestions": review,
652
+ }, ensure_ascii=False))
653
+ if image_prompt.startswith("Image description:"):
654
+ image_prompt = image_prompt[len("Image description:"):]
655
+ review, success = image_prompt_reviewer.run(json.dumps({
656
+ "all_pages": pages,
657
+ "current_page": page,
658
+ "image_description": image_prompt
659
+ }, ensure_ascii=False))
660
+ if review == "Check passed.":
661
+ break
662
+ image_prompts.append(image_prompt)
663
+ return image_prompts
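As a usage reference, StoryDiffusionAgent takes the image_generation block of the YAML config and a pathlib.Path save directory, revises per-page image prompts with the LLM, and renders them with the SDXL-based synthesizer. A sketch under stated assumptions (a CUDA device, downloadable SDXL weights, and a DASHSCOPE_API_KEY for the prompt-revision LLM; the pages and directory below are illustrative):

    from pathlib import Path
    from mm_story_agent.modality_agents.image_agent import StoryDiffusionAgent

    image_cfg = {
        "revise_cfg": {"num_turns": 3},
        "obj_cfg": {"model_name": "stabilityai/stable-diffusion-xl-base-1.0",
                    "id_length": 2, "height": 512, "width": 1024},
        "call_cfg": {"seed": 112536, "guidance_scale": 10.0, "style_name": "Storybook"},
    }
    save_dir = Path("generated_stories/manual_run/image")
    save_dir.mkdir(parents=True, exist_ok=True)

    agent = StoryDiffusionAgent(image_cfg)
    result = agent.call(pages=["A fox finds an old computer in the forest.",
                               "The fox asks its parents how the machine works."],
                        save_path=save_dir)
    # result["generation_results"] holds the PIL images, also saved as p1.png, p2.png, ...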
mm_story_agent/modality_agents/llm.py ADDED
@@ -0,0 +1,73 @@
+ from typing import Callable
+ import os
+
+ from dashscope import Generation
+
+
+ class QwenAgent(object):
+
+     def __init__(self,
+                  system_prompt: str = None,
+                  track_history: bool = True):
+         self.system_prompt = system_prompt
+         if system_prompt is None:
+             self.history = []
+         else:
+             self.history = [
+                 {"role": "system", "content": system_prompt}
+             ]
+         self.track_history = track_history
+
+     def basic_success_check(self, response):
+         if not response or not response.output or not response.output.text:
+             print(response)
+             return False
+         else:
+             return True
+
+     def run(self,
+             prompt: str,
+             top_p: float = 0.95,
+             temperature: float = 1.0,
+             seed: int = 1,
+             max_length: int = 1024,
+             max_try: int = 5,
+             success_check_fn: Callable = None
+             ):
+         self.history.append({
+             "role": "user",
+             "content": prompt
+         })
+         success = False
+         try_times = 0
+         while try_times < max_try:
+             response = Generation.call(
+                 model="qwen2-72b-instruct",
+                 messages=self.history,
+                 top_p=top_p,
+                 temperature=temperature,
+                 api_key=os.environ.get('DASHSCOPE_API_KEY'),
+                 seed=seed,
+                 max_length=max_length
+             )
+             if success_check_fn is None:
+                 success_check_fn = lambda x: True
+             if self.basic_success_check(response) and success_check_fn(response.output.text):
+                 response = response.output.text
+                 self.history.append({
+                     "role": "assistant",
+                     "content": response
+                 })
+                 success = True
+                 break
+             else:
+                 try_times += 1
+
+         if not self.track_history:
+             if self.system_prompt is not None:
+                 self.history = self.history[:1]
+             else:
+                 self.history = []
+
+         return response, success
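QwenAgent is a thin retry wrapper around DashScope's Generation API; run() returns a (text, success) tuple and retries until success_check_fn accepts the output or max_try attempts are exhausted. A usage sketch (it assumes DASHSCOPE_API_KEY is set in the environment; the JSON validator below is an illustrative check, not part of the commit):

    import json
    from mm_story_agent.modality_agents.llm import QwenAgent

    def is_json(text):
        try:
            json.loads(text)
            return True
        except json.JSONDecodeError:
            return False

    agent = QwenAgent(system_prompt="Answer with a single JSON object.", track_history=False)
    answer, success = agent.run("Give me a JSON object with keys 'title' and 'mood'.",
                                temperature=0.5, max_try=3, success_check_fn=is_json)
    if success:
        print(json.loads(answer))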
mm_story_agent/modality_agents/music_agent.py ADDED
@@ -0,0 +1,78 @@
+ from pathlib import Path
+ import json
+ from typing import List, Union
+
+ import torchaudio
+ from audiocraft.models import MusicGen
+ from audiocraft.data.audio import audio_write
+
+ from mm_story_agent.modality_agents.llm import QwenAgent
+ from mm_story_agent.prompts_en import story_to_music_reviser_system, story_to_music_reviewer_system
+
+
+ class MusicGenSynthesizer:
+
+     def __init__(self,
+                  model_name: str = 'facebook/musicgen-medium',
+                  sample_rate: int = 16000,
+                  ) -> None:
+         self.model = MusicGen.get_pretrained(model_name)
+         self.sample_rate = sample_rate
+
+     def call(self,
+              prompt: Union[str, List[str]],
+              save_path: Union[str, Path],
+              duration: float = 60.0,
+              ):
+         self.model.set_generation_params(duration=duration)
+         wav = self.model.generate([prompt], progress=True)[0].cpu()
+         wav = torchaudio.functional.resample(wav, self.model.sample_rate, self.sample_rate)
+         save_path = Path(save_path).parent / Path(save_path).stem
+         audio_write(save_path, wav, self.sample_rate)
+
+
+ class MusicGenAgent:
+
+     def __init__(self, config, llm_type="qwen2") -> None:
+         self.config = config
+         if llm_type == "qwen2":
+             self.LLM = QwenAgent
+
+     def generate_music_prompt_from_story(
+             self,
+             pages: List,
+             num_turns: int = 3
+     ):
+         music_prompt_reviser = self.LLM(story_to_music_reviser_system, track_history=False)
+         music_prompt_reviewer = self.LLM(story_to_music_reviewer_system, track_history=False)
+
+         music_prompt = ""
+         review = ""
+         for turn in range(num_turns):
+             music_prompt, success = music_prompt_reviser.run(json.dumps({
+                 "story": pages,
+                 "previous_result": music_prompt,
+                 "improvement_suggestions": review,
+             }, ensure_ascii=False))
+             review, success = music_prompt_reviewer.run(json.dumps({
+                 "story_content": pages,
+                 "music_description": music_prompt
+             }, ensure_ascii=False))
+             if review == "Check passed.":
+                 break
+
+         return music_prompt
+
+     def call(self, pages: List, save_path: str):
+         save_path = Path(save_path)
+         music_prompt = self.generate_music_prompt_from_story(pages, **self.config["revise_cfg"])
+         generation_agent = MusicGenSynthesizer()
+         generation_agent.call(
+             prompt=music_prompt,
+             save_path=save_path / "music.wav",
+             **self.config["call_cfg"]
+         )
+         return {
+             "prompt": music_prompt,
+             "modality": "music"
+         }
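MusicGenAgent mirrors the other modality agents: it iteratively revises a music description with the LLM, then renders it with MusicGen and writes music.wav under the given directory. A sketch using the YAML defaults (downloadable audiocraft weights, a GPU, and DASHSCOPE_API_KEY are assumed; the page text and directory are illustrative):

    from pathlib import Path
    from mm_story_agent.modality_agents.music_agent import MusicGenAgent

    music_cfg = {"revise_cfg": {"num_turns": 3}, "call_cfg": {"duration": 60.0}}
    save_dir = Path("generated_stories/manual_run/music")
    save_dir.mkdir(parents=True, exist_ok=True)

    agent = MusicGenAgent(music_cfg)
    result = agent.call(pages=["A calm bedtime story about a child learning to manage time."],
                        save_path=save_dir)
    print(result)   # {"prompt": "<revised music description>", "modality": "music"}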
mm_story_agent/modality_agents/sound_agent.py ADDED
@@ -0,0 +1,106 @@
1
+ from pathlib import Path
2
+ from typing import List
3
+ import json
4
+
5
+ import torch
6
+ import soundfile as sf
7
+ from diffusers import AudioLDM2Pipeline
8
+
9
+ from mm_story_agent.prompts_en import story_to_sound_reviser_system, story_to_sound_review_system
10
+ from mm_story_agent.modality_agents.llm import QwenAgent
11
+
12
+
13
+ class AudioLDM2Synthesizer:
14
+
15
+ def __init__(self,
16
+ model_path: str = None,
17
+ ) -> None:
18
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
19
+ self.pipe = AudioLDM2Pipeline.from_pretrained(
20
+ model_path if model_path is not None else "cvssp/audioldm2",
21
+ torch_dtype=torch.float16
22
+ ).to(self.device)
23
+
24
+ def call(self,
25
+ prompts: List[str],
26
+ n_candidate_per_text: int = 3,
27
+ seed: int = 0,
28
+ guidance_scale: float = 3.5,
29
+ ddim_steps: int = 100,
30
+ ):
31
+ generator = torch.Generator(device=self.device).manual_seed(seed)
32
+ audios = self.pipe(
33
+ prompts,
34
+ num_inference_steps=ddim_steps,
35
+ audio_length_in_s=10.0,
36
+ guidance_scale=guidance_scale,
37
+ generator=generator,
38
+ num_waveforms_per_prompt=n_candidate_per_text).audios
39
+
40
+ audios = audios[::n_candidate_per_text]
41
+
42
+ return audios
43
+
44
+
45
+ class AudioLDM2Agent:
46
+
47
+ def __init__(self, config, llm_type="qwen2") -> None:
48
+ self.config = config
49
+ if llm_type == "qwen2":
50
+ self.LLM = QwenAgent
51
+
52
+ def call(self, pages: List, save_path: str):
53
+ sound_prompts = self.generate_sound_prompt_from_story(pages, **self.config["revise_cfg"])
54
+ save_paths = []
55
+ forward_prompts = []
56
+ save_path = Path(save_path)
57
+ for idx in range(len(pages)):
58
+ if sound_prompts[idx] != "No sounds.":
59
+ save_paths.append(save_path / f"p{idx + 1}.wav")
60
+ forward_prompts.append(sound_prompts[idx])
61
+
62
+ generation_agent = AudioLDM2Synthesizer()
63
+ if len(forward_prompts) > 0:
64
+ sounds = generation_agent.call(
65
+ forward_prompts,
66
+ **self.config["call_cfg"]
67
+ )
68
+ for sound, path in zip(sounds, save_paths):
69
+ sf.write(path.__str__(), sound, self.config["sample_rate"])
70
+ return {
71
+ "prompts": sound_prompts,
72
+ "modality": "sound"
73
+ }
74
+
75
+ def generate_sound_prompt_from_story(
76
+ self,
77
+ pages: List,
78
+ num_turns: int = 3
79
+ ):
80
+ sound_prompt_reviser = self.LLM(story_to_sound_reviser_system, track_history=False)
81
+ sound_prompt_reviewer = self.LLM(story_to_sound_review_system, track_history=False)
82
+
83
+ sound_prompts = []
84
+ for page in pages:
85
+ review = ""
86
+ sound_prompt = ""
87
+ for turn in range(num_turns):
88
+ sound_prompt, success = sound_prompt_reviser.run(json.dumps({
89
+ "story": page,
90
+ "previous_result": sound_prompt,
91
+ "improvement_suggestions": review,
92
+ }, ensure_ascii=False))
93
+ if sound_prompt.startswith("Sound description:"):
94
+ sound_prompt = sound_prompt[len("Sound description:"):]
95
+ review, success = sound_prompt_reviewer.run(json.dumps({
96
+ "story": page,
97
+ "sound_description": sound_prompt
98
+ }, ensure_ascii=False))
99
+ if review == "Check passed.":
100
+ break
101
+ # else:
102
+ # print(review)
103
+ sound_prompts.append(sound_prompt)
104
+
105
+ return sound_prompts
106
+
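Similarly, a usage sketch for AudioLDM2Agent (the config keys follow the code above; the values are assumptions):

from mm_story_agent.modality_agents.sound_agent import AudioLDM2Agent

config = {
    "revise_cfg": {"num_turns": 3},
    "call_cfg": {"n_candidate_per_text": 3, "guidance_scale": 3.5, "ddim_steps": 100, "seed": 0},
    "sample_rate": 16000,   # used when writing the generated wav files
}
agent = AudioLDM2Agent(config, llm_type="qwen2")
# assumes the output directory already exists; one p<idx>.wav is written per page with sounds
result = agent.call(pages=["Rain taps on the window while someone sleeps."],
                    save_path="generated_story/sound")
print(result)  # {"prompts": [...], "modality": "sound"}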
mm_story_agent/modality_agents/speech_agent.py ADDED
@@ -0,0 +1,90 @@
1
+ import os
2
+ import json
3
+ from pathlib import Path
4
+ from typing import List
5
+
6
+ from aliyunsdkcore.client import AcsClient
7
+ from aliyunsdkcore.request import CommonRequest
8
+ import nls
9
+
10
+
11
+ class CosyVoiceSynthesizer:
12
+
13
+ def __init__(self) -> None:
14
+ self.access_key_id = os.environ.get('ALIYUN_ACCESS_KEY_ID')
15
+ self.access_key_secret = os.environ.get('ALIYUN_ACCESS_KEY_SECRET')
16
+ self.app_key = os.environ.get('ALIYUN_APP_KEY')
17
+ self.setup_token()
18
+
19
+ def setup_token(self):
20
+ client = AcsClient(self.access_key_id, self.access_key_secret,
21
+ 'cn-shanghai')
22
+ request = CommonRequest()
23
+ request.set_method('POST')
24
+ request.set_domain('nls-meta.cn-shanghai.aliyuncs.com')
25
+ request.set_version('2019-02-28')
26
+ request.set_action_name('CreateToken')
27
+
28
+ try:
29
+ response = client.do_action_with_exception(request)
30
+ jss = json.loads(response)
31
+ if 'Token' in jss and 'Id' in jss['Token']:
32
+ token = jss['Token']['Id']
33
+ self.token = token
34
+ except Exception as e:
35
+ import traceback
36
+ raise RuntimeError(
37
+ f'Request token failed with error: {e}, with detail {traceback.format_exc()}'
38
+ )
39
+
40
+ def call(self, save_file, transcript, voice="longyuan", sample_rate=16000):
41
+ writer = open(save_file, "wb")
42
+ return_data = b''
43
+
44
+ def write_data(data, *args):
45
+ nonlocal return_data
46
+ return_data += data
47
+ if writer is not None:
48
+ writer.write(data)
49
+
50
+ def raise_error(error, *args):
51
+ raise RuntimeError(
52
+ f'Synthesizing speech failed with error: {error}')
53
+
54
+ def close_file(*args):
55
+ if writer is not None:
56
+ writer.close()
57
+
58
+ sdk = nls.NlsStreamInputTtsSynthesizer(
59
+ url='wss://nls-gateway-cn-beijing.aliyuncs.com/ws/v1',
60
+ token=self.token,
61
+ appkey=self.app_key,
62
+ on_data=write_data,
63
+ on_error=raise_error,
64
+ on_close=close_file,
65
+ )
66
+
67
+ sdk.startStreamInputTts(voice=voice, sample_rate=sample_rate, aformat='wav')
68
+ sdk.sendStreamInputTts(transcript,)
69
+ sdk.stopStreamInputTts()
70
+
71
+
72
+ class CosyVoiceAgent:
73
+
74
+ def __init__(self, config) -> None:
75
+ self.config = config
76
+
77
+ def call(self, pages: List, save_path: str):
78
+ save_path = Path(save_path)
79
+ generation_agent = CosyVoiceSynthesizer()
80
+
81
+ for idx, page in enumerate(pages):
82
+ generation_agent.call(
83
+ save_file=save_path / f"p{idx + 1}.wav",
84
+ transcript=page,
85
+ **self.config["call_cfg"]
86
+ )
87
+
88
+ return {
89
+ "modality": "speech"
90
+ }
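A usage sketch for CosyVoiceAgent; CosyVoiceSynthesizer reads ALIYUN_ACCESS_KEY_ID, ALIYUN_ACCESS_KEY_SECRET and ALIYUN_APP_KEY from the environment, and the call_cfg values below simply restate the synthesizer defaults:

from mm_story_agent.modality_agents.speech_agent import CosyVoiceAgent

config = {
    "call_cfg": {"voice": "longyuan", "sample_rate": 16000},
}
agent = CosyVoiceAgent(config)
# assumes the output directory already exists; writes p1.wav, p2.wav, ...
result = agent.call(pages=["Once upon a time, a small boat set out to sea."],
                    save_path="generated_story/speech")
print(result)  # {"modality": "speech"}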
mm_story_agent/modality_agents/story_agent.py ADDED
@@ -0,0 +1,114 @@
1
+ import json
2
+ import random
3
+
4
+ from tqdm import trange, tqdm
5
+
6
+ from mm_story_agent.modality_agents.llm import QwenAgent
7
+ from mm_story_agent.prompts_en import question_asker_system, expert_system, \
8
+ dlg_based_writer_system, dlg_based_writer_prompt, chapter_writer_system
9
+
10
+
11
+ def parse_list(output):
12
+ try:
13
+ pages = eval(output)
14
+ return isinstance(pages, list)
15
+ except Exception:
16
+ return False
17
+
18
+
19
+ def json_parse_outline(outline):
20
+ outline = outline.strip("```json").strip("```")
21
+ try:
22
+ outline = json.loads(outline)
23
+ if not isinstance(outline, dict):
24
+ return False
25
+ if outline.keys() != {"story_title", "story_outline"}:
26
+ return False
27
+ for chapter in outline["story_outline"]:
28
+ if chapter.keys() != {"chapter_title", "chapter_summary"}:
29
+ return False
30
+ except json.decoder.JSONDecodeError:
31
+ return False
32
+ return True
33
+
34
+
35
+ class QAOutlineStoryWriter:
36
+
37
+ def __init__(self,
38
+ story_gen_config,
39
+ llm_type: str = "qwen2"):
40
+ if llm_type == "qwen2":
41
+ self.LLM = QwenAgent
42
+ self.story_gen_config = story_gen_config
43
+
44
+ def generate_outline(self, story_setting):
45
+ temperature = self.story_gen_config["temperature"]
46
+ max_conv_turns = self.story_gen_config["max_conv_turns"]
47
+ num_outline = self.story_gen_config["num_outline"]
48
+ asker = self.LLM(question_asker_system, track_history=False)
49
+ expert = self.LLM(expert_system, track_history=False)
50
+
51
+ dialogue = []
52
+ for turn in trange(max_conv_turns):
53
+ dialogue_history = "\n".join(dialogue)
54
+
55
+ question, success = asker.run(f"Story setting: {story_setting}\nDialogue history: \n{dialogue_history}\n", temperature=temperature)
56
+ question = question.strip()
57
+ if question == "Thank you for your help!":
58
+ break
59
+ dialogue.append(f"You: {question}")
60
+ answer, success = expert.run(f"Story setting: {story_setting}\nQuestion: \n{question}\nAnswer: ", temperature=temperature)
61
+ answer = answer.strip()
62
+ dialogue.append(f"Expert: {answer}")
63
+
64
+ # print("\n".join(dialogue))
65
+ writer = self.LLM(dlg_based_writer_system, track_history=False)
66
+ writer_prompt = dlg_based_writer_prompt.format(
67
+ story_setting=story_setting,
68
+ dialogue_history="\n".join(dialogue),
69
+ num_outline=num_outline
70
+ )
71
+
72
+ outline, success = writer.run(writer_prompt, success_check_fn=json_parse_outline)
73
+ outline = json.loads(outline)
74
+ # print(outline)
75
+ return outline
76
+
77
+ def generate_story_from_outline(self, outline):
78
+ temperature = self.story_gen_config["temperature"]
79
+ chapter_writer = self.LLM(chapter_writer_system, track_history=False)
80
+ all_pages = []
81
+ for idx, chapter in enumerate(tqdm(outline["story_outline"])):
82
+ chapter_detail, success = chapter_writer.run(
83
+ json.dumps(
84
+ {
85
+ "completed_story": all_pages,
86
+ "current_chapter": chapter
87
+ },
88
+ ensure_ascii=False
89
+ ),
90
+ success_check_fn=parse_list,
91
+ temperature=temperature
92
+ )
93
+ while success is False:
94
+ chapter_detail, success = chapter_writer.run(
95
+ json.dumps(
96
+ {
97
+ "completed_story": all_pages,
98
+ "current_chapter": chapter
99
+ },
100
+ ensure_ascii=False
101
+ ),
102
+ seed=random.randint(0, 100000),
103
+ temperature=temperature,
104
+ success_check_fn=parse_list
105
+ )
106
+ pages = [page.strip() for page in eval(chapter_detail)]
107
+ all_pages.extend(pages)
108
+ # print(all_pages)
109
+ return all_pages
110
+
111
+ def call(self, story_setting):
112
+ outline = self.generate_outline(story_setting)
113
+ pages = self.generate_story_from_outline(outline)
114
+ return pages
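A sketch of driving QAOutlineStoryWriter end to end; the config keys match those read in generate_outline, while the concrete values and the story setting are illustrative (the setting keys follow the JSON format described in prompts_en.py):

from mm_story_agent.modality_agents.story_agent import QAOutlineStoryWriter

writer = QAOutlineStoryWriter(
    story_gen_config={
        "temperature": 0.5,    # sampling temperature for asker, expert and writer
        "max_conv_turns": 3,   # question-answer rounds before outlining
        "num_outline": 4,      # number of chapters requested in the outline
    },
    llm_type="qwen2",
)
pages = writer.call({
    "story_topic": "friendship and sharing",
    "main_role": "a shy hedgehog",
    "scene": "a forest clearing",
})
print(pages)  # flat list of pages, one string per storybook page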
mm_story_agent/prompts_en.py ADDED
@@ -0,0 +1,277 @@
1
+
2
+ instruction = """
3
+ 1. Conciseness: Describe the plot of each chapter in a simple and straightforward manner, using a storybook tone without excessive details.
4
+ 2. Narrative Style: There is no need for dialogue or interaction with the reader.
5
+ 3. Coherent Plot: The story should have a coherent plot, with connections and reflections throughout. All chapters should contribute to the same overarching story, rather than being independent little tales.
6
+ 4. Reasonableness: The plot should make sense, avoiding logical errors and unreasonable elements.
7
+ 5. Educational Value: A good bedtime story should have educational significance, helping children learn proper values and behaviors.
8
+ 6. Warm and Pleasant: The story should evoke a sense of ease, warmth, and joy, making children feel loved and cared for.
9
+ """.strip()
10
+
11
+
12
+ question_asker_system = """
13
+ ## Basic requirements for children's stories:
14
+ 1. Storytelling Style: No need for dialogue or interaction with the reader.
15
+ 2. Coherent Plot: The story plot should be coherent and consistent throughout.
16
+ 3. Logical Consistency: The plot must be logical, without any logical errors or unreasonable elements.
17
+ 4. Educational Significance: An excellent bedtime story should convey certain educational values, helping children learn proper values and behaviors.
18
+ 5. Warm and Pleasant: The story should ideally evoke a feeling of lightness, warmth, and happiness, making children feel loved and cared for.
19
+
20
+ ## Story setting format
21
+ The story setting is given as a JSON object, such as:
22
+ {
23
+ "story_topic": "xxx",
24
+ "main_role": "xxx",
25
+ "scene": "xxx",
26
+ ...
27
+ }
28
+
29
+ You are a student learning to write children's stories, discussing writing ideas with an expert.
30
+ Please ask the expert questions to discuss the information needed for writing a story following the given setting.
31
+ If you have no more questions, say "Thank you for your help!" to end the conversation.
32
+ Ask only one question at a time and avoid repeating previously asked questions. Your questions should relate to the given setting, such as the story topic.
33
+ """.strip()
34
+
35
+
36
+ expert_system = """
37
+ ## Basic requirements for children's stories:
38
+ 1. Storytelling Style: No need for dialogue or interaction with the reader.
39
+ 2. Coherent Plot: The story plot should be coherent and consistent throughout.
40
+ 3. Logical Consistency: The plot must be logical, without any logical errors or unreasonable elements.
41
+ 4. Educational Significance: An excellent bedtime story should convey certain educational values, helping children learn proper values and behaviors.
42
+ 5. Warm and Pleasant: The story should ideally evoke a feeling of lightness, warmth, and happiness, making children feel loved and cared for.
43
+
44
+ ## Story setting format
45
+ The story setting is given as a JSON object, such as:
46
+ {
47
+ "story_topic": "xxx",
48
+ "main_role": "xxx",
49
+ "scene": "xxx",
50
+ ...
51
+ }
52
+
53
+ You are an expert in children's story writing. You are discussing creative ideas with a student learning to write children's stories. Please provide meaningful responses to the student's questions.
54
+ """.strip()
55
+
56
+
57
+ dlg_based_writer_system = """
58
+ Based on a dialogue, write an outline for a children's storybook. This dialogue provides some points and ideas for writing the outline.
59
+ When writing the outline, basic requirements should be met:
60
+ {instruction}
61
+
62
+ ## Output Format
63
+ Output a valid JSON object, following the format:
64
+ {{
65
+ "story_title": "xxx",
66
+ "story_outline": [{{"chapter_title":"xxx", "chapter_summary": "xxx"}}, {{"chapter_title":"xxx", "chapter_summary": "xxx"}}],
67
+ }}
68
+ """.strip().format(instruction=instruction)
69
+
70
+ dlg_based_writer_prompt = """
71
+ Story setting: {story_setting}
72
+ Dialogue history:
73
+ {dialogue_history}
74
+ Write a story outline with {num_outline} chapters.
75
+ """.strip()
76
+
77
+
78
+ chapter_writer_system = """
79
+ Based on the story outline, expand the given chapter summary into detailed story content.
80
+
81
+ ## Input Content
82
+ The input consists of already written story content and the current chapter that needs to be expanded, in the following format:
83
+ {
84
+ "completed_story": ["xxx", "xxx"] // each element represents a page of story content.
85
+ "current_chapter": {"chapter_title": "xxx", "chapter_summary": "xxx"}
86
+ }
87
+
88
+ ## Output Content
89
+ Output the expanded story content for the current chapter. The result should be a list where each element corresponds to the plot of one page of the storybook.
90
+
91
+ ## Notes
92
+ 1. Only expand the current chapter; do not overwrite content from other chapters.
93
+ 2. The expanded content should not be too lengthy, with a maximum of 3 pages and no more than 2 sentences per page.
94
+ 3. Maintain the tone of the story; do not add extra annotations, explanations, settings, or comments.
95
+ 4. If the story is already complete, no further writing is necessary.
96
+ """.strip()
97
+
98
+
99
+ role_extract_system = """
100
+ Extract all main role names from the given story content and generate corresponding role descriptions. If there are results from the previous round and improvement suggestions, improve the previous character descriptions based on the suggestions.
101
+
102
+ ## Steps
103
+ 1. First, identify the main role's name in the story.
104
+ 2. Then, identify other frequently occurring roles.
105
+ 3. Generate descriptions for these roles. Ensure descriptions are **brief** and focus on **visual** features indicating gender or species, such as "little boy" or "bird".
106
+ 4. Ensure that descriptions do not exceed 20 words.
107
+
108
+
109
+ ## Input Format
110
+ The input consists of the story content and possibly the previous output results with corresponding improvement suggestions, formatted as:
111
+ {
112
+ "story_content": "xxx",
113
+ "previous_result": {
114
+ "(role 1's name)": "xxx",
115
+ "(role 2's name)": "xxx"
116
+ }, // Empty indicates the first round
117
+ "improvement_suggestions": "xxx" // Empty indicates the first round
118
+ }
119
+
120
+ ## Output Format
121
+ Output the character names and descriptions following this format:
122
+ {
123
+ "(role 1's name)": "xxx",
124
+ "(role 2's name)": "xxx"
125
+ }
126
+ Strictly follow the above steps and directly output the results without any additional content.
127
+ """.strip()
128
+
129
+
130
+ role_review_system = """
131
+ Review the role descriptions corresponding to the given story. If requirements are met, output "Check passed.". If not, provide improvement suggestions.
132
+
133
+ ## Requirements for Role Descriptions
134
+ 1. Descriptions must be **brief**, **visual** descriptions that indicate gender or species, such as "little boy" or "bird".
135
+ 2. Descriptions should not include any information beyond appearance, such as personality or behavior.
136
+ 3. The description of each role must not exceed 20 words.
137
+
138
+ ## Input Format
139
+ The input consists of the story content and role extraction results, with a format of:
140
+ {
141
+ "story_content": "xxx",
142
+ "role_descriptions": {
143
+ "(Character 1's Name)": "xxx",
144
+ "(Character 2's Name)": "xxx"
145
+ }
146
+ }
147
+
148
+ ## Output Format
149
+ Directly output improvement suggestions without any additional content if requirements are not met. Otherwise, output "Check passed."
150
+ """.strip()
151
+
152
+
153
+ story_to_image_reviser_system = """
154
+ Convert the given story content into image description. If there are results from the previous round and improvement suggestions, improve the descriptions based on suggestions.
155
+
156
+ ## Input Format
157
+ The input consists of all story pages, the current page, and possibly the previous output results with corresponding improvement suggestions, formatted as:
158
+ {
159
+ "all_pages": ["xxx", "xxx"], // Each element is a page of story content
160
+ "current_page": "xxx",
161
+ "previous_result": "xxx", // If empty, indicates the first round
162
+ "improvement_suggestions": "xxx" // If empty, indicates the first round
163
+ }
164
+
165
+ ## Output Format
166
+ Output a string describing the image corresponding to the current story content without any additional content.
167
+
168
+ ## Notes
169
+ 1. Keep it concise. Focus on the main visual elements, omit details.
170
+ 2. Retain visual elements. Only describe static scenes, avoid the plot details.
171
+ 3. Remove non-visual elements. Typical non-visual elements include dialogue, thoughts, and plot.
172
+ 4. Retain role names.
173
+ """.strip()
174
+
175
+ story_to_image_review_system = """
176
+ Review the image description corresponding to the given story content. If the requirements are met, output "Check passed.". If not, provide improvement suggestions.
177
+
178
+ ## Requirements for Image Descriptions
179
+ 1. Keep it concise. Focus on the main visual elements, omit details.
180
+ 2. Retain visual elements. Only describe static scenes, avoid the plot details.
181
+ 3. Remove non-visual elements. Typical non-visual elements include dialogue, thoughts, and plot.
182
+ 4. Retain role names.
183
+
184
+ ## Input Format
185
+ The input consists of all story content, the current story content, and the corresponding image description, structured as:
186
+ {
187
+ "all_pages": ["xxx", "xxx"],
188
+ "current_page": "xxx",
189
+ "image_description": "xxx"
190
+ }
191
+
192
+ ## Output Format
193
+ Directly output improvement suggestions without any additional content if requirements are not met. Otherwise, output "Check passed."
194
+ """.strip()
195
+
196
+ story_to_sound_reviser_system = """
197
+ Extract possible sound effects from the given story content. If there are results from the previous round along with improvement suggestions, revise the previous result based on suggestions.
198
+
199
+ ## Input Format
200
+ The input consists of the story content, and may also include the previous result and corresponding improvement suggestions, formatted as:
201
+ {
202
+ "story": "xxx",
203
+ "previous_result": "xxx", // empty indicates the first round
204
+ "improvement_suggestions": "xxx" // empty indicates the first round
205
+ }
206
+
207
+ ## Output Format
208
+ Output a string describing the sound effects without any additional content.
209
+
210
+ ## Notes
211
+ 1. The description must be sounds. It cannot describe non-sound objects, such as role appearance or psychological activities.
212
+ 2. The number of sound effects must not exceed 3.
213
+ 3. Exclude speech.
214
+ 4. Exclude musical and instrumental sounds, such as background music.
215
+ 5. Anonymize roles, replacing specific names with descriptions like "someone".
216
+ 6. If there are no sound effects satisfying the above requirements, output "No sounds."
217
+ """.strip()
218
+
219
+ story_to_sound_review_system = """
220
+ Review sound effects corresponding to the given story content. If the requirements are met, output "Check passed.". If not, provide improvement suggestions.
221
+
222
+ ## Requirements for Sound Descriptions
223
+ 1. The description must be sounds. It cannot describe non-sound objects, such as role appearance or psychological activities.
224
+ 2. The number of sounds must not exceed 3.
225
+ 3. No speech should be included.
226
+ 4. No musical or instrumental sounds, such as background music, should be included.
227
+ 5. Roles must be anonymized. Role names should be replaced by descriptions like "someone".
228
+ 6. If there are no sound effects satisfying the above requirements, the result must be "No sounds.".
229
+
230
+ ## Input Format
231
+ The input consists of the story content and the corresponding sound description, formatted as:
232
+ {
233
+ "story": "xxx",
234
+ "sound_description": "xxx"
235
+ }
236
+
237
+ ## Output Format
238
+ Directly output improvement suggestions without any additional content if requirements are not met. Otherwise, output "Check passed."
239
+ """.strip()
240
+
241
+ story_to_music_reviser_system = """
242
+ Generate suitable background music descriptions based on the story content. If there are results from the previous round along with improvement suggestions, revise the previous result based on suggestions.
243
+
244
+ ## Input Format
245
+ The input consists of the story content, and may also include the previous result and corresponding improvement suggestions, formatted as:
246
+ {
247
+ "story": ["xxx", "xxx"], // Each element is a page of story content
248
+ "previous_result": "xxx", // empty indicates the first round
249
+ "improvement_suggestions": "xxx" // empty indicates the first round
250
+ }
251
+
252
+ ## Output Format
253
+ Output a string describing the background music without any additional content.
254
+
255
+ ## Notes
256
+ 1. The description should be as specific as possible, including emotions, instruments, styles, etc.
257
+ 2. Do not include specific role names.
258
+ """.strip()
259
+
260
+
261
+ story_to_music_reviewer_system = """
262
+ Review the background music description corresponding to the story content to check whether the description is suitable. If suitable, output "Check passed.". If not, provide improvement suggestions.
263
+
264
+ ## Requirements for Background Music Descriptions
265
+ 1. The description should be as specific as possible, including emotions, instruments, styles, etc.
266
+ 2. Do not include specific role names.
267
+
268
+ ## Input Format
269
+ The input consists of the story content and the corresponding music description, structured as:
270
+ {
271
+ "story": ["xxx", "xxx"], // Each element is a page of story content
272
+ "music_description": "xxx"
273
+ }
274
+
275
+ ## Output Format
276
+ Directly output improvement suggestions without any additional content if requirements are not met. Otherwise, output "Check passed.".
277
+ """.strip()
mm_story_agent/video_compose_agent.py ADDED
@@ -0,0 +1,412 @@
1
+ from pathlib import Path
2
+ from typing import List, Union
3
+ import random
4
+ import re
5
+ from datetime import timedelta
6
+
7
+ from tqdm import trange
8
+ import numpy as np
9
+ import librosa
10
+ import cv2
11
+ from zhon.hanzi import punctuation as zh_punc
12
+
13
+ from moviepy.editor import ImageClip, AudioFileClip, CompositeAudioClip, \
14
+ CompositeVideoClip, ColorClip, VideoFileClip, VideoClip, TextClip, concatenate_audioclips
15
+ import moviepy.video.compositing.transitions as transfx
16
+ from moviepy.audio.AudioClip import AudioArrayClip
17
+ from moviepy.audio.fx.all import audio_loop
18
+ from moviepy.video.tools.subtitles import SubtitlesClip
19
+
20
+
21
+ def generate_srt(timestamps: List,
22
+ captions: List,
23
+ save_path: Union[str, Path],
24
+ max_single_length: int = 30):
25
+
26
+ def format_time(seconds: float) -> str:
27
+ td = timedelta(seconds=seconds)
28
+ total_seconds = int(td.total_seconds())
29
+ millis = int((td.total_seconds() - total_seconds) * 1000)
30
+ hours, remainder = divmod(total_seconds, 3600)
31
+ minutes, seconds = divmod(remainder, 60)
32
+ return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"
33
+
34
+ srt_content = []
35
+ num_caps = len(timestamps)
36
+
37
+ for idx in range(num_caps):
38
+ start_time, end_time = timestamps[idx]
39
+ caption_chunks = split_caption(captions[idx], max_single_length).split("\n")
40
+ num_chunks = len(caption_chunks)
41
+
42
+ if num_chunks == 0:
43
+ continue
44
+
45
+ segment_duration = (end_time - start_time) / num_chunks
46
+
47
+ for chunk_idx, chunk in enumerate(caption_chunks):
48
+ chunk_start_time = start_time + segment_duration * chunk_idx
49
+ chunk_end_time = start_time + segment_duration * (chunk_idx + 1)
50
+ start_time_str = format_time(chunk_start_time)
51
+ end_time_str = format_time(chunk_end_time)
52
+ srt_content.append(f"{len(srt_content) + 1}\n{start_time_str} --> {end_time_str}\n{chunk}\n\n")
53
+
54
+ with open(save_path, 'w') as srt_file:
55
+ srt_file.writelines(srt_content)
56
+
57
+
58
+ def add_caption(captions: List,
59
+ srt_path: Union[str, Path],
60
+ timestamps: List,
61
+ video_clip: VideoClip,
62
+ max_single_length: int = 30,
63
+ **caption_config):
64
+ generate_srt(timestamps, captions, srt_path, max_single_length)
65
+
66
+ generator = lambda txt: TextClip(txt, **caption_config)
67
+ subtitles = SubtitlesClip(srt_path.__str__(), generator)
68
+ captioned_clip = CompositeVideoClip([video_clip,
69
+ subtitles.set_position(("center", "bottom"), relative=True)])
70
+ return captioned_clip
71
+
72
+
73
+ def split_keep_separator(text, separator):
74
+ pattern = f'([{re.escape(separator)}])'
75
+ pieces = re.split(pattern, text)
76
+ return pieces
77
+
78
+
79
+ def split_caption(caption, max_length=30):
80
+ lines = []
81
+ if caption[0].isascii() and caption[0].isalpha():
82
+ words = caption.split(" ")
83
+ current_words = []
84
+ for word in words:
85
+ if len(" ".join(current_words + [word])) <= max_length:
86
+ current_words += [word]
87
+ else:
88
+ if current_words:
89
+ lines.append(" ".join(current_words))
90
+ current_words = [word]
91
+
92
+ if current_words:
93
+ lines.append(" ".join(current_words))
94
+ else:
95
+ sentences = split_keep_separator(caption, zh_punc)
96
+ current_line = ""
97
+ for sentence in sentences:
98
+ if len(current_line + sentence) <= max_length:
99
+ current_line += sentence
100
+ else:
101
+ if current_line:
102
+ lines.append(current_line)
103
+ current_line = ""
104
+ if sentence.startswith(tuple(zh_punc)):
105
+ if lines:
106
+ lines[-1] += sentence[0]
107
+ current_line = sentence[1:]
108
+ else:
109
+ current_line = sentence
110
+
111
+ if current_line:
112
+ lines.append(current_line.strip())
113
+
114
+ return '\n'.join(lines)
115
+
116
+
117
+ def add_bottom_black_area(clip: VideoFileClip,
118
+ black_area_height: int = 64):
119
+ """
120
+ Add a black area at the bottom of the video clip (for captions).
121
+
122
+ Args:
123
+ clip (VideoFileClip): Video clip to be processed.
124
+ black_area_height (int): Height of the black area.
125
+
126
+ Returns:
127
+ VideoFileClip: Processed video clip.
128
+ """
129
+ black_bar = ColorClip(size=(clip.w, black_area_height), color=(0, 0, 0), duration=clip.duration)
130
+ extended_clip = CompositeVideoClip([clip, black_bar.set_position(("center", "bottom"))])
131
+ return extended_clip
132
+
133
+
134
+ def add_zoom_effect(clip, speed=1.0, mode='in', position='center'):
135
+ fps = clip.fps
136
+ duration = clip.duration
137
+ total_frames = int(duration * fps)
138
+ def main(getframe, t):
139
+ frame = getframe(t)
140
+ h, w = frame.shape[: 2]
141
+ i = t * fps
142
+ if mode == 'out':
143
+ i = total_frames - i
144
+ zoom = 1 + (i * ((0.1 * speed) / total_frames))
145
+ positions = {'center': [(w - (w * zoom)) / 2, (h - (h * zoom)) / 2],
146
+ 'left': [0, (h - (h * zoom)) / 2],
147
+ 'right': [(w - (w * zoom)), (h - (h * zoom)) / 2],
148
+ 'top': [(w - (w * zoom)) / 2, 0],
149
+ 'topleft': [0, 0],
150
+ 'topright': [(w - (w * zoom)), 0],
151
+ 'bottom': [(w - (w * zoom)) / 2, (h - (h * zoom))],
152
+ 'bottomleft': [0, (h - (h * zoom))],
153
+ 'bottomright': [(w - (w * zoom)), (h - (h * zoom))]}
154
+ tx, ty = positions[position]
155
+ M = np.array([[zoom, 0, tx], [0, zoom, ty]])
156
+ frame = cv2.warpAffine(frame, M, (w, h))
157
+ return frame
158
+ return clip.fl(main)
159
+
160
+
161
+ def add_move_effect(clip, direction="left", move_ratio=0.95):
162
+
163
+ orig_width = clip.size[0]
164
+ orig_height = clip.size[1]
165
+
166
+ new_width = int(orig_width / move_ratio)
167
+ new_height = int(orig_height / move_ratio)
168
+ clip = clip.resize(width=new_width, height=new_height)
169
+
170
+ if direction == "left":
171
+ start_position = (0, 0)
172
+ end_position = (orig_width - new_width, 0)
173
+ elif direction == "right":
174
+ start_position = (orig_width - new_width, 0)
175
+ end_position = (0, 0)
176
+
177
+ duration = clip.duration
178
+ moving_clip = clip.set_position(
179
+ lambda t: (start_position[0] + (
180
+ end_position[0] - start_position[0]) / duration * t, start_position[1])
181
+ )
182
+
183
+ final_clip = CompositeVideoClip([moving_clip], size=(orig_width, orig_height))
184
+
185
+ return final_clip
186
+
187
+
188
+ def add_slide_effect(clips, slide_duration):
189
+ ####### CAUTION: requires at least `slide_duration` of silence at the end of each clip #######
190
+ durations = [clip.duration for clip in clips]
191
+ first_clip = CompositeVideoClip(
192
+ [clips[0].fx(transfx.slide_out, duration=slide_duration, side="left")]
193
+ ).set_start(0)
194
+
195
+ slide_out_sides = ["left"]
196
+ videos = [first_clip]
197
+
198
+ out_to_in_mapping = {"left": "right", "right": "left"}
199
+
200
+ for idx, clip in enumerate(clips[1: -1], start=1):
201
+ # For all other clips in the middle, we need them to slide in to the previous clip and out for the next one
202
+
203
+ # determine `slide_in_side` according to the `slide_out_side` of the previous clip
204
+ slide_in_side = out_to_in_mapping[slide_out_sides[-1]]
205
+
206
+ slide_out_side = "left" if random.random() <= 0.5 else "right"
207
+ slide_out_sides.append(slide_out_side)
208
+
209
+ videos.append(
210
+ (
211
+ CompositeVideoClip(
212
+ [clip.fx(transfx.slide_in, duration=slide_duration, side=slide_in_side)]
213
+ )
214
+ .set_start(sum(durations[:idx]) - (slide_duration) * idx)
215
+ .fx(transfx.slide_out, duration=slide_duration, side=slide_out_side)
216
+ )
217
+ )
218
+
219
+ last_clip = CompositeVideoClip(
220
+ [clips[-1].fx(transfx.slide_in, duration=slide_duration, side=out_to_in_mapping[slide_out_sides[-1]])]
221
+ ).set_start(sum(durations[:-1]) - slide_duration * (len(clips) - 1))
222
+ videos.append(last_clip)
223
+
224
+ video = CompositeVideoClip(videos)
225
+ return video
226
+
227
+
228
+ def compose_video(story_dir: Union[str, Path],
229
+ save_path: Union[str, Path],
230
+ captions: List,
231
+ music_path: Union[str, Path],
232
+ num_pages: int,
233
+ fps: int = 10,
234
+ audio_sample_rate: int = 16000,
235
+ audio_codec: str = "mp3",
236
+ caption_config: dict = {},
237
+ max_single_caption_length: int = 30,
238
+ fade_duration: float = 1.0,
239
+ slide_duration: float = 0.4,
240
+ zoom_speed: float = 0.5,
241
+ move_ratio: float = 0.95,
242
+ sound_volume: float = 0.2,
243
+ music_volume: float = 0.2,
244
+ bg_speech_ratio: float = 0.4):
245
+ if not isinstance(story_dir, Path):
246
+ story_dir = Path(story_dir)
247
+
248
+ sound_dir = story_dir / "sound"
249
+ image_dir = story_dir / "image"
250
+ speech_dir = story_dir / "speech"
251
+
252
+ video_clips = []
253
+ # audio_durations = []
254
+ cur_duration = 0
255
+ timestamps = []
256
+
257
+ for page in trange(1, num_pages + 1):
258
+ ##### speech track
259
+ slide_silence = AudioArrayClip(np.zeros((int(audio_sample_rate * slide_duration), 2)), fps=audio_sample_rate)
260
+ fade_silence = AudioArrayClip(np.zeros((int(audio_sample_rate * fade_duration), 2)), fps=audio_sample_rate)
261
+
262
+ if (speech_dir / f"p{page}.wav").exists(): # single speech file
263
+ single_utterance = True
264
+ speech_file = (speech_dir / f"./p{page}.wav").__str__()
265
+ speech_clip = AudioFileClip(speech_file, fps=audio_sample_rate)
266
+ # speech_clip = speech_clip.audio_fadein(fade_duration)
267
+
268
+ speech_clip = concatenate_audioclips([fade_silence, speech_clip, fade_silence])
269
+ else: # multiple speech files
270
+ single_utterance = False
271
+ speech_files = list(speech_dir.glob(f"p{page}_*.wav"))
272
+ speech_files = sorted(speech_files, key=lambda x: int(x.stem.split("_")[-1]))
273
+ speech_clips = []
274
+ for utt_idx, speech_file in enumerate(speech_files):
275
+ speech_clip = AudioFileClip(speech_file.__str__(), fps=audio_sample_rate)
276
+ # add multiple timestamps of the same speech clip
277
+ if utt_idx == 0:
278
+ timestamps.append([cur_duration + fade_duration,
279
+ cur_duration + fade_duration + speech_clip.duration])
280
+ cur_duration += speech_clip.duration + fade_duration
281
+ elif utt_idx == len(speech_files) - 1:
282
+ timestamps.append([
283
+ cur_duration,
284
+ cur_duration + speech_clip.duration
285
+ ])
286
+ cur_duration += speech_clip.duration + fade_duration + slide_duration
287
+ else:
288
+ timestamps.append([
289
+ cur_duration,
290
+ cur_duration + speech_clip.duration
291
+ ])
292
+ cur_duration += speech_clip.duration
293
+ speech_clips.append(speech_clip)
294
+ speech_clip = concatenate_audioclips([fade_silence] + speech_clips + [fade_silence])
295
+ speech_file = speech_files[0] # for energy calculation
296
+
297
+ # add slide silence
298
+ if page == 1:
299
+ speech_clip = concatenate_audioclips([speech_clip, slide_silence])
300
+ else:
301
+ speech_clip = concatenate_audioclips([slide_silence, speech_clip, slide_silence])
302
+
303
+ # add the timestamp of the whole clip as a single element
304
+ if single_utterance:
305
+ if page == 1:
306
+ timestamps.append([cur_duration + fade_duration,
307
+ cur_duration + speech_clip.duration - fade_duration - slide_duration])
308
+ cur_duration += speech_clip.duration - slide_duration
309
+ else:
310
+ timestamps.append([cur_duration + fade_duration + slide_duration,
311
+ cur_duration + speech_clip.duration - fade_duration - slide_duration])
312
+ cur_duration += speech_clip.duration - slide_duration
313
+
314
+ speech_array, _ = librosa.core.load(speech_file, sr=None)
315
+ speech_rms = librosa.feature.rms(y=speech_array)[0].mean()
316
+
317
+ # set image as the main content, align the duration
318
+ image_file = (image_dir / f"./p{page}.png").__str__()
319
+ image_clip = ImageClip(image_file)
320
+ image_clip = image_clip.set_duration(speech_clip.duration).set_fps(fps)
321
+ image_clip = image_clip.crossfadein(fade_duration).crossfadeout(fade_duration)
322
+
323
+ if random.random() <= 0.5: # zoom in or zoom out
324
+ if random.random() <= 0.5:
325
+ zoom_mode = "in"
326
+ else:
327
+ zoom_mode = "out"
328
+ image_clip = add_zoom_effect(image_clip, zoom_speed, zoom_mode)
329
+ else: # move left or right
330
+ if random.random() <= 0.5:
331
+ direction = "left"
332
+ else:
333
+ direction = "right"
334
+ image_clip = add_move_effect(image_clip, direction=direction, move_ratio=move_ratio)
335
+
336
+ # sound track
337
+ sound_file = sound_dir / f"p{page}.wav"
338
+ if sound_file.exists():
339
+ sound_clip = AudioFileClip(sound_file.__str__(), fps=audio_sample_rate)
340
+ sound_clip = sound_clip.audio_fadein(fade_duration)
341
+ if sound_clip.duration < speech_clip.duration:
342
+ sound_clip = audio_loop(sound_clip, duration=speech_clip.duration)
343
+ else:
344
+ sound_clip = sound_clip.subclip(0, speech_clip.duration)
345
+ sound_array, _ = librosa.core.load(sound_file.__str__(), sr=None)
346
+ sound_rms = librosa.feature.rms(y=sound_array)[0].mean()
347
+ ratio = speech_rms / sound_rms * bg_speech_ratio
348
+ audio_clip = CompositeAudioClip([speech_clip, sound_clip.volumex(sound_volume * ratio).audio_fadeout(fade_duration)])
349
+ else:
350
+ audio_clip = speech_clip
351
+
352
+ video_clip = image_clip.set_audio(audio_clip)
353
+ video_clips.append(video_clip)
354
+
355
+ # audio_durations.append(audio_clip.duration)
356
+
357
+ # final_clip = concatenate_videoclips(video_clips, method="compose")
358
+ composite_clip = add_slide_effect(video_clips, slide_duration=slide_duration)
359
+ composite_clip = add_bottom_black_area(composite_clip, black_area_height=caption_config["area_height"])
360
+ del caption_config["area_height"]
361
+ composite_clip = add_caption(
362
+ captions,
363
+ story_dir / "captions.srt",
364
+ timestamps,
365
+ composite_clip,
366
+ max_single_caption_length,
367
+ **caption_config
368
+ )
369
+
370
+ # add music track, align the duration
371
+ music_clip = AudioFileClip(music_path.__str__(), fps=audio_sample_rate)
372
+ music_array, _ = librosa.core.load(music_path.__str__(), sr=None)
373
+ music_rms = librosa.feature.rms(y=music_array)[0].mean()
374
+ ratio = speech_rms / music_rms * bg_speech_ratio
375
+ if music_clip.duration < composite_clip.duration:
376
+ music_clip = audio_loop(music_clip, duration=composite_clip.duration)
377
+ else:
378
+ music_clip = music_clip.subclip(0, composite_clip.duration)
379
+ all_audio_clip = CompositeAudioClip([composite_clip.audio, music_clip.volumex(music_volume * ratio)])
380
+ composite_clip = composite_clip.set_audio(all_audio_clip)
381
+
382
+ composite_clip.write_videofile(save_path.__str__(),
383
+ audio_fps=audio_sample_rate,
384
+ audio_codec=audio_codec,)
385
+
386
+
387
+ class VideoComposeAgent:
388
+
389
+ def adjust_caption_config(self, width, height):
390
+ area_height = int(height * 0.06)
391
+ fontsize = int((width + height) / 2 * 0.025)
392
+ return {
393
+ "fontsize": fontsize,
394
+ "area_height": area_height
395
+ }
396
+
397
+ def call(self, pages, config):
398
+ height = config["image_generation"]["obj_cfg"]["height"]
399
+ width = config["image_generation"]["obj_cfg"]["width"]
400
+ config["caption_config"].update(self.adjust_caption_config(width, height))
401
+ compose_video(
402
+ story_dir=Path(config["story_dir"]),
403
+ save_path=Path(config["story_dir"]) / "output.mp4",
404
+ captions=pages,
405
+ music_path=Path(config["story_dir"]) / "music/music.wav",
406
+ num_pages=len(pages),
407
+ audio_sample_rate=config["audio_sample_rate"],
408
+ audio_codec=config["audio_codec"],
409
+ caption_config=config["caption_config"],
410
+ max_single_caption_length=config["max_single_caption_length"],
411
+ **config["slideshow_effect"]
412
+ )
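A configuration sketch for VideoComposeAgent.call, listing the keys read by the code above (directory layout and concrete values are assumptions; compose_video expects image/, speech/, sound/ and music/ folders under story_dir, and TextClip requires ImageMagick):

from mm_story_agent.video_compose_agent import VideoComposeAgent

config = {
    "story_dir": "generated_story",
    "audio_sample_rate": 16000,
    "audio_codec": "mp3",
    "caption_config": {"font": "Arial", "color": "white"},  # extra TextClip kwargs; fontsize and area_height are filled in by adjust_caption_config
    "max_single_caption_length": 30,
    "slideshow_effect": {
        "fade_duration": 1.0, "slide_duration": 0.4, "zoom_speed": 0.5, "move_ratio": 0.95,
        "sound_volume": 0.2, "music_volume": 0.2, "bg_speech_ratio": 0.4,
    },
    "image_generation": {"obj_cfg": {"width": 1280, "height": 720}},
}
pages = ["Once upon a time, a small boat set out to sea.", "The end."]
VideoComposeAgent().call(pages, config)  # writes generated_story/output.mp4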
nls-1.0.0-py3-none-any.whl ADDED
Binary file (47 kB).
 
policy.xml ADDED
@@ -0,0 +1,99 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE policymap [
3
+ <!ELEMENT policymap (policy)*>
4
+ <!ATTLIST policymap xmlns CDATA #FIXED ''>
5
+ <!ELEMENT policy EMPTY>
6
+ <!ATTLIST policy xmlns CDATA #FIXED '' domain NMTOKEN #REQUIRED
7
+ name NMTOKEN #IMPLIED pattern CDATA #IMPLIED rights NMTOKEN #IMPLIED
8
+ stealth NMTOKEN #IMPLIED value CDATA #IMPLIED>
9
+ ]>
10
+ <!--
11
+ Configure ImageMagick policies.
12
+
13
+ Domains include system, delegate, coder, filter, path, or resource.
14
+
15
+ Rights include none, read, write, execute and all. Use | to combine them,
16
+ for example: "read | write" to permit read from, or write to, a path.
17
+
18
+ Use a glob expression as a pattern.
19
+
20
+ Suppose we do not want users to process MPEG video images:
21
+
22
+ <policy domain="delegate" rights="none" pattern="mpeg:decode" />
23
+
24
+ Here we do not want users reading images from HTTP:
25
+
26
+ <policy domain="coder" rights="none" pattern="HTTP" />
27
+
28
+ The /repository file system is restricted to read only. We use a glob
29
+ expression to match all paths that start with /repository:
30
+
31
+ <policy domain="path" rights="read" pattern="/repository/*" />
32
+
33
+ Let's prevent users from executing any image filters:
34
+
35
+ <policy domain="filter" rights="none" pattern="*" />
36
+
37
+ Any large image is cached to disk rather than memory:
38
+
39
+ <policy domain="resource" name="area" value="1GP"/>
40
+
41
+ Use the default system font unless overridden by the application:
42
+
43
+ <policy domain="system" name="font" value="/usr/share/fonts/favorite.ttf"/>
44
+
45
+ Define arguments for the memory, map, area, width, height and disk resources
46
+ with SI prefixes (e.g. 100MB). In addition, resource policies are maximums
47
+ for each instance of ImageMagick (e.g. policy memory limit 1GB, -limit 2GB
48
+ exceeds policy maximum so memory limit is 1GB).
49
+
50
+ Rules are processed in order. Here we want to restrict ImageMagick to only
51
+ read or write a small subset of proven web-safe image types:
52
+
53
+ <policy domain="delegate" rights="none" pattern="*" />
54
+ <policy domain="filter" rights="none" pattern="*" />
55
+ <policy domain="coder" rights="none" pattern="*" />
56
+ <policy domain="coder" rights="read|write" pattern="{GIF,JPEG,PNG,WEBP}" />
57
+ -->
58
+ <policymap>
59
+ <!-- <policy domain="resource" name="temporary-path" value="/tmp"/> -->
60
+ <policy domain="resource" name="memory" value="256MiB"/>
61
+ <policy domain="resource" name="map" value="512MiB"/>
62
+ <policy domain="resource" name="width" value="16KP"/>
63
+ <policy domain="resource" name="height" value="16KP"/>
64
+ <!-- <policy domain="resource" name="list-length" value="128"/> -->
65
+ <policy domain="resource" name="area" value="128MP"/>
66
+ <policy domain="resource" name="disk" value="1GiB"/>
67
+ <!-- <policy domain="resource" name="file" value="768"/> -->
68
+ <!-- <policy domain="resource" name="thread" value="4"/> -->
69
+ <!-- <policy domain="resource" name="throttle" value="0"/> -->
70
+ <!-- <policy domain="resource" name="time" value="3600"/> -->
71
+ <!-- <policy domain="coder" rights="none" pattern="MVG" /> -->
72
+ <!-- <policy domain="module" rights="none" pattern="{PS,PDF,XPS}" /> -->
73
+ <!-- <policy domain="path" rights="none" pattern="@*" /> -->
74
+ <!-- <policy domain="cache" name="memory-map" value="anonymous"/> -->
75
+ <!-- <policy domain="cache" name="synchronize" value="True"/> -->
76
+ <!-- <policy domain="cache" name="shared-secret" value="passphrase" stealth="true"/>
77
+ <!-- <policy domain="system" name="max-memory-request" value="256MiB"/> -->
78
+ <!-- <policy domain="system" name="shred" value="2"/> -->
79
+ <!-- <policy domain="system" name="precision" value="6"/> -->
80
+ <!-- <policy domain="system" name="font" value="/path/to/font.ttf"/> -->
81
+ <!-- <policy domain="system" name="pixel-cache-memory" value="anonymous"/> -->
82
+ <!-- <policy domain="system" name="shred" value="2"/> -->
83
+ <!-- <policy domain="system" name="precision" value="6"/> -->
84
+ <!-- not needed due to the need to use explicitly by mvg: -->
85
+ <!-- <policy domain="delegate" rights="none" pattern="MVG" /> -->
86
+ <!-- use curl -->
87
+ <policy domain="delegate" rights="none" pattern="URL" />
88
+ <policy domain="delegate" rights="none" pattern="HTTPS" />
89
+ <policy domain="delegate" rights="none" pattern="HTTP" />
90
+ <!-- in order to avoid to get image with password text -->
91
+ <!-- <policy domain="path" rights="none" pattern="@*"/> -->
92
+ <!-- disable ghostscript format types -->
93
+ <policy domain="coder" rights="none" pattern="PS" />
94
+ <policy domain="coder" rights="none" pattern="PS2" />
95
+ <policy domain="coder" rights="none" pattern="PS3" />
96
+ <policy domain="coder" rights="none" pattern="EPS" />
97
+ <policy domain="coder" rights="none" pattern="PDF" />
98
+ <policy domain="coder" rights="none" pattern="XPS" />
99
+ </policymap>
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ Pillow
2
+ PyYAML
3
+ pypinyin
4
+ soundfile
5
+ dashscope
6
+ tqdm
7
+ zhon
8
+ numpy
9
+ librosa
10
+ moviepy
11
+ opencv-python
12
+ nls-1.0.0-py3-none-any.whl
13
+ audiocraft