Anyou committed on
Commit
4a6e43e
1 Parent(s): 0ffaa52

Upload 11 files

Browse files
Files changed (11)
  1. __init__.py +0 -0
  2. config.yaml +63 -0
  3. environment.yml +271 -0
  4. fid_utils.py +41 -0
  5. main.py +537 -0
  6. pororo_100.h5 +3 -0
  7. readme-storyvisualization.md +123 -0
  8. requirements.txt +10 -0
  9. run.sh +1 -0
  10. test.py +94 -0
  11. transtoyolo.py +320 -0
__init__.py ADDED
Binary file (2 Bytes).
 
config.yaml ADDED
@@ -0,0 +1,63 @@
+ # device
+ mode: sample # train sample
+ gpu_ids: [3] # gpu ids
+ batch_size: 1 # batch size each item denotes one story
+ num_workers: 4 # number of workers
+ num_cpu_cores: -1 # number of cpu cores
+ seed: 0 # random seed
+ ckpt_dir: /root/lihui/StoryVisualization/save_ckpt_epoch5_new # checkpoint directory
+ run_name: ARLDM # name for this run
+
+ # task
+ dataset: pororo # pororo flintstones vistsis vistdii
+ task: visualization # continuation visualization
+
+ # train
+ init_lr: 1e-5 # initial learning rate
+ warmup_epochs: 1 # warmup epochs
+ max_epochs: 5 #50 # max epochs
+ train_model_file: /root/lihui/StoryVisualization/save_ckpt_3last50/ARLDM/last.ckpt # model file for resume, none for train from scratch
+ freeze_clip: True #False # whether to freeze clip
+ freeze_blip: True #False # whether to freeze blip
+ freeze_resnet: True #False # whether to freeze resnet
+
+ # sample
+ test_model_file: /root/lihui/StoryVisualization/save_ckpt_3last50/ARLDM/last.ckpt # model file for test
+ calculate_fid: True # whether to calculate FID scores
+ scheduler: ddim # ddim pndm
+ guidance_scale: 6 # guidance scale
+ num_inference_steps: 250 # number of inference steps
+ sample_output_dir: /root/lihui/StoryVisualization/save_samples_128_epoch50 # output directory
+
+ pororo:
+   hdf5_file: /root/lihui/StoryVisualization/pororo.h5
+   max_length: 85
+   new_tokens: [ "pororo", "loopy", "eddy", "harry", "poby", "tongtong", "crong", "rody", "petty" ]
+   clip_embedding_tokens: 49416
+   blip_embedding_tokens: 30530
+
+ flintstones:
+   hdf5_file: /path/to/flintstones.h5
+   max_length: 91
+   new_tokens: [ "fred", "barney", "wilma", "betty", "pebbles", "dino", "slate" ]
+   clip_embedding_tokens: 49412
+   blip_embedding_tokens: 30525
+
+ vistsis:
+   hdf5_file: /path/to/vist.h5
+   max_length: 100
+   clip_embedding_tokens: 49408
+   blip_embedding_tokens: 30524
+
+ vistdii:
+   hdf5_file: /path/to/vist.h5
+   max_length: 65
+   clip_embedding_tokens: 49408
+   blip_embedding_tokens: 30524
+
+ hydra:
+   run:
+     dir: .
+   output_subdir: null
+ hydra/job_logging: disabled
+ hydra/hydra_logging: disabled
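Note: config.yaml above is consumed through Hydra in main.py (`@hydra.main(config_path=".", config_name="config")`). A minimal sketch of how the nested keys resolve into a `DictConfig`, assuming hydra-core and omegaconf as pinned in environment.yml:

```python
# Minimal sketch only; mirrors the access pattern used in main.py.
import hydra
from omegaconf import DictConfig, OmegaConf


@hydra.main(config_path=".", config_name="config")
def show_config(args: DictConfig) -> None:
    print(OmegaConf.to_yaml(args))            # full resolved config
    print(args.mode, args.dataset)            # top-level keys: sample, pororo
    print(args.get(args.dataset).max_length)  # nested access used by main.py (85 for pororo)


if __name__ == "__main__":
    show_config()
```

Any value can also be overridden on the command line, e.g. `python main.py mode=train`.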
environment.yml ADDED
@@ -0,0 +1,271 @@
1
+ name: story
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - defaults
6
+ dependencies:
7
+ - _libgcc_mutex=0.1=main
8
+ - _openmp_mutex=5.1=1_gnu
9
+ - blas=1.0=mkl
10
+ - brotlipy=0.7.0=py38h27cfd23_1003
11
+ - bzip2=1.0.8=h7b6447c_0
12
+ - ca-certificates=2023.01.10=h06a4308_0
13
+ - certifi=2022.12.7=py38h06a4308_0
14
+ - cffi=1.15.1=py38h5eee18b_3
15
+ - cryptography=39.0.1=py38h9ce1e76_0
16
+ - cuda-cudart=11.7.99=0
17
+ - cuda-cupti=11.7.101=0
18
+ - cuda-libraries=11.7.1=0
19
+ - cuda-nvrtc=11.7.99=0
20
+ - cuda-nvtx=11.7.91=0
21
+ - cuda-runtime=11.7.1=0
22
+ - ffmpeg=4.3=hf484d3e_0
23
+ - flit-core=3.8.0=py38h06a4308_0
24
+ - freetype=2.12.1=h4a9f257_0
25
+ - giflib=5.2.1=h5eee18b_3
26
+ - gmp=6.2.1=h295c915_3
27
+ - gnutls=3.6.15=he1e5248_0
28
+ - idna=3.4=py38h06a4308_0
29
+ - intel-openmp=2021.4.0=h06a4308_3561
30
+ - jpeg=9e=h5eee18b_1
31
+ - lame=3.100=h7b6447c_0
32
+ - lcms2=2.12=h3be6417_0
33
+ - ld_impl_linux-64=2.38=h1181459_1
34
+ - lerc=3.0=h295c915_0
35
+ - libcublas=11.10.3.66=0
36
+ - libcufft=10.7.2.124=h4fbf590_0
37
+ - libcufile=1.6.0.25=0
38
+ - libcurand=10.3.2.56=0
39
+ - libcusolver=11.4.0.1=0
40
+ - libcusparse=11.7.4.91=0
41
+ - libdeflate=1.17=h5eee18b_0
42
+ - libffi=3.4.2=h6a678d5_6
43
+ - libgcc-ng=11.2.0=h1234567_1
44
+ - libgomp=11.2.0=h1234567_1
45
+ - libiconv=1.16=h7f8727e_2
46
+ - libidn2=2.3.2=h7f8727e_0
47
+ - libnpp=11.7.4.75=0
48
+ - libnvjpeg=11.8.0.2=0
49
+ - libpng=1.6.39=h5eee18b_0
50
+ - libstdcxx-ng=11.2.0=h1234567_1
51
+ - libtasn1=4.19.0=h5eee18b_0
52
+ - libtiff=4.5.0=h6a678d5_2
53
+ - libunistring=0.9.10=h27cfd23_0
54
+ - libwebp=1.2.4=h11a3e52_1
55
+ - libwebp-base=1.2.4=h5eee18b_1
56
+ - lz4-c=1.9.4=h6a678d5_0
57
+ - mkl=2021.4.0=h06a4308_640
58
+ - mkl-service=2.4.0=py38h7f8727e_0
59
+ - mkl_fft=1.3.1=py38hd3c417c_0
60
+ - mkl_random=1.2.2=py38h51133e4_0
61
+ - ncurses=6.4=h6a678d5_0
62
+ - nettle=3.7.3=hbbd107a_1
63
+ - numpy-base=1.23.5=py38h31eccc5_0
64
+ - openh264=2.1.1=h4ff587b_0
65
+ - openssl=1.1.1t=h7f8727e_0
66
+ - pip=23.0.1=py38h06a4308_0
67
+ - pycparser=2.21=pyhd3eb1b0_0
68
+ - pyopenssl=23.0.0=py38h06a4308_0
69
+ - pysocks=1.7.1=py38h06a4308_0
70
+ - python=3.8.16=h7a1cb2a_3
71
+ - pytorch=1.13.1=py3.8_cuda11.7_cudnn8.5.0_0
72
+ - pytorch-cuda=11.7=h778d358_3
73
+ - pytorch-mutex=1.0=cuda
74
+ - readline=8.2=h5eee18b_0
75
+ - six=1.16.0=pyhd3eb1b0_1
76
+ - sqlite=3.41.1=h5eee18b_0
77
+ - tk=8.6.12=h1ccaba5_0
78
+ - typing_extensions=4.4.0=py38h06a4308_0
79
+ - urllib3=1.26.15=py38h06a4308_0
80
+ - wheel=0.38.4=py38h06a4308_0
81
+ - xz=5.2.10=h5eee18b_1
82
+ - zlib=1.2.13=h5eee18b_0
83
+ - zstd=1.5.4=hc292b87_0
84
+ - pip:
85
+ - absl-py==1.4.0
86
+ - accelerate==0.17.1
87
+ - aiofiles==23.1.0
88
+ - aiohttp==3.8.4
89
+ - aiosignal==1.3.1
90
+ - altair==4.2.2
91
+ - antlr4-python3-runtime==4.9.3
92
+ - anyio==3.6.2
93
+ - appdirs==1.4.4
94
+ - argon2-cffi==21.3.0
95
+ - argon2-cffi-bindings==21.2.0
96
+ - arrow==1.2.3
97
+ - asttokens==2.2.1
98
+ - async-timeout==4.0.2
99
+ - attrs==22.2.0
100
+ - backcall==0.2.0
101
+ - beautifulsoup4==4.11.2
102
+ - bleach==6.0.0
103
+ - cachetools==5.3.0
104
+ - chardet==5.1.0
105
+ - charset-normalizer==3.1.0
106
+ - click==8.1.3
107
+ - comm==0.1.2
108
+ - contourpy==1.0.7
109
+ - cycler==0.11.0
110
+ - debugpy==1.6.6
111
+ - decorator==5.1.1
112
+ - defusedxml==0.7.1
113
+ - diffusers==0.9.0
114
+ - docker-pycreds==0.4.0
115
+ - entrypoints==0.4
116
+ - executing==1.2.0
117
+ - fastapi==0.95.0
118
+ - fastjsonschema==2.16.3
119
+ - ffmpy==0.3.0
120
+ - filelock==3.10.0
121
+ - fire==0.5.0
122
+ - flatbuffers==23.3.3
123
+ - fonttools==4.39.3
124
+ - fqdn==1.5.1
125
+ - frozenlist==1.3.3
126
+ - fsspec==2023.3.0
127
+ - ftfy==6.1.1
128
+ - gitdb==4.0.10
129
+ - gitpython==3.1.31
130
+ - google-auth==2.16.2
131
+ - google-auth-oauthlib==0.4.6
132
+ - gradio==3.24.1
133
+ - gradio-client==0.0.5
134
+ - grpcio==1.51.3
135
+ - h11==0.14.0
136
+ - h5py==3.8.0
137
+ - httpcore==0.16.3
138
+ - httpx==0.23.3
139
+ - huggingface-hub==0.13.2
140
+ - hydra-core==1.3.2
141
+ - importlib-metadata==6.1.0
142
+ - importlib-resources==5.12.0
143
+ - ipykernel==6.21.3
144
+ - ipython==8.11.0
145
+ - ipython-genutils==0.2.0
146
+ - ipywidgets==8.0.4
147
+ - isoduration==20.11.0
148
+ - jedi==0.18.2
149
+ - jinja2==3.1.2
150
+ - jsonpointer==2.3
151
+ - jsonschema==4.17.3
152
+ - jupyter==1.0.0
153
+ - jupyter-client==8.0.3
154
+ - jupyter-console==6.6.3
155
+ - jupyter-core==5.3.0
156
+ - jupyter-events==0.6.3
157
+ - jupyter-server==2.5.0
158
+ - jupyter-server-terminals==0.4.4
159
+ - jupyterlab-pygments==0.2.2
160
+ - jupyterlab-widgets==3.0.5
161
+ - kiwisolver==1.4.4
162
+ - lightning-bolts==0.5.0
163
+ - linkify-it-py==2.0.0
164
+ - lora-diffusion==0.1.7
165
+ - markdown==3.4.1
166
+ - markdown-it-py==2.2.0
167
+ - markupsafe==2.1.2
168
+ - matplotlib==3.7.1
169
+ - matplotlib-inline==0.1.6
170
+ - mdit-py-plugins==0.3.3
171
+ - mdurl==0.1.2
172
+ - mediapipe==0.9.1.0
173
+ - mistune==2.0.5
174
+ - multidict==6.0.4
175
+ - nbclassic==0.5.3
176
+ - nbclient==0.7.2
177
+ - nbconvert==7.2.10
178
+ - nbformat==5.7.3
179
+ - nest-asyncio==1.5.6
180
+ - notebook==6.5.3
181
+ - notebook-shim==0.2.2
182
+ - numpy==1.24.2
183
+ - oauthlib==3.2.2
184
+ - omegaconf==2.3.0
185
+ - opencv-contrib-python==4.7.0.72
186
+ - opencv-python==4.7.0.72
187
+ - orjson==3.8.9
188
+ - packaging==23.0
189
+ - pandas==1.5.3
190
+ - pandocfilters==1.5.0
191
+ - parso==0.8.3
192
+ - pathtools==0.1.2
193
+ - pexpect==4.8.0
194
+ - pickleshare==0.7.5
195
+ - pillow==9.4.0
196
+ - pkgutil-resolve-name==1.3.10
197
+ - platformdirs==3.1.1
198
+ - prometheus-client==0.16.0
199
+ - prompt-toolkit==3.0.38
200
+ - protobuf==3.20.1
201
+ - psutil==5.9.4
202
+ - ptyprocess==0.7.0
203
+ - pure-eval==0.2.2
204
+ - pyasn1==0.4.8
205
+ - pyasn1-modules==0.2.8
206
+ - pydantic==1.10.7
207
+ - pydeprecate==0.3.2
208
+ - pydub==0.25.1
209
+ - pygments==2.14.0
210
+ - pyparsing==3.0.9
211
+ - pyrsistent==0.19.3
212
+ - python-dateutil==2.8.2
213
+ - python-json-logger==2.0.7
214
+ - python-multipart==0.0.6
215
+ - pytorch-lightning==1.6.5
216
+ - pytz==2023.3
217
+ - pyyaml==6.0
218
+ - pyzmq==25.0.1
219
+ - qtconsole==5.4.1
220
+ - qtpy==2.3.0
221
+ - regex==2022.10.31
222
+ - requests==2.28.2
223
+ - requests-oauthlib==1.3.1
224
+ - rfc3339-validator==0.1.4
225
+ - rfc3986==1.5.0
226
+ - rfc3986-validator==0.1.1
227
+ - rsa==4.9
228
+ - safetensors==0.3.0
229
+ - scipy==1.10.1
230
+ - semantic-version==2.10.0
231
+ - send2trash==1.8.0
232
+ - sentry-sdk==1.17.0
233
+ - setproctitle==1.3.2
234
+ - setuptools==59.5.0
235
+ - smmap==5.0.0
236
+ - sniffio==1.3.0
237
+ - soupsieve==2.4
238
+ - stack-data==0.6.2
239
+ - starlette==0.26.1
240
+ - tensorboard==2.12.0
241
+ - tensorboard-data-server==0.7.0
242
+ - tensorboard-plugin-wit==1.8.1
243
+ - termcolor==2.2.0
244
+ - terminado==0.17.1
245
+ - timm==0.6.12
246
+ - tinycss2==1.2.1
247
+ - tokenizers==0.13.2
248
+ - toolz==0.12.0
249
+ - torch==1.9.0
250
+ - torchaudio==0.9.0
251
+ - torchmetrics==0.11.4
252
+ - torchvision==0.10.0+cu111
253
+ - tornado==6.2
254
+ - tqdm==4.65.0
255
+ - traitlets==5.9.0
256
+ - transformers==4.28.1
257
+ - typing-extensions==4.5.0
258
+ - uc-micro-py==1.0.1
259
+ - uri-template==1.2.0
260
+ - uvicorn==0.21.1
261
+ - wandb==0.14.0
262
+ - wcwidth==0.2.6
263
+ - webcolors==1.12
264
+ - webencodings==0.5.1
265
+ - websocket-client==1.5.1
266
+ - websockets==11.0
267
+ - werkzeug==2.2.3
268
+ - widgetsnbextension==4.0.5
269
+ - yarl==1.8.2
270
+ - zipp==3.15.0
271
+ prefix: /root/anaconda3/envs/story
fid_utils.py ADDED
@@ -0,0 +1,41 @@
+ import numpy as np
+ from scipy import linalg
+
+
+ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
+     mu1 = np.atleast_1d(mu1)
+     mu2 = np.atleast_1d(mu2)
+
+     sigma1 = np.atleast_2d(sigma1)
+     sigma2 = np.atleast_2d(sigma2)
+
+     assert mu1.shape == mu2.shape, 'Training and test mean vectors have different lengths'
+     assert sigma1.shape == sigma2.shape, 'Training and test covariances have different dimensions'
+
+     diff = mu1 - mu2
+
+     # Product might be almost singular
+     covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
+     if not np.isfinite(covmean).all():
+         print('fid calculation produces singular product; adding %s to diagonal of cov estimates' % eps)
+         offset = np.eye(sigma1.shape[0]) * eps
+         covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
+
+     # Numerical error might give slight imaginary component
+     if np.iscomplexobj(covmean):
+         if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
+             m = np.max(np.abs(covmean.imag))
+             raise ValueError('Imaginary component {}'.format(m))
+         covmean = covmean.real
+
+     return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * np.trace(covmean)
+
+
+ def calculate_fid_given_features(feature1, feature2):
+     mu1 = np.mean(feature1, axis=0)
+     sigma1 = np.cov(feature1, rowvar=False)
+     mu2 = np.mean(feature2, axis=0)
+     sigma2 = np.cov(feature2, rowvar=False)
+     fid_value = calculate_frechet_distance(mu1, sigma1, mu2, sigma2)
+
+     return fid_value
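`calculate_fid_given_features` expects two feature matrices with one row per image; in main.py it is fed 2048-dimensional InceptionV3 pool features. An illustrative usage sketch with random features (small dimensionality chosen only to keep the covariances well conditioned; not real Inception activations):

```python
# Illustrative only: random Gaussian "features" stand in for Inception activations.
import numpy as np
from fid_utils import calculate_fid_given_features

rng = np.random.default_rng(0)
real_feats = rng.normal(size=(500, 64))                 # rows: images, cols: feature dims
fake_feats = rng.normal(loc=0.1, size=(500, 64))        # slightly shifted distribution
print(calculate_fid_given_features(real_feats, fake_feats))  # small positive FID value
```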
main.py ADDED
@@ -0,0 +1,537 @@
1
+ import inspect
2
+ import os
3
+
4
+ import cv2
5
+ import hydra
6
+ import numpy as np
7
+ import pytorch_lightning as pl
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import torch.utils.checkpoint
11
+ from PIL import Image
12
+ from diffusers import AutoencoderKL, DDPMScheduler, LMSDiscreteScheduler, PNDMScheduler, DDIMScheduler
13
+ from omegaconf import DictConfig
14
+ from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR
15
+ from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
16
+ from pytorch_lightning.loggers import TensorBoardLogger
17
+ from pytorch_lightning.strategies import DDPStrategy
18
+ from torch import nn
19
+ from torch.utils.data import DataLoader
20
+ from torchvision import transforms
21
+ from transformers import CLIPTokenizer, CLIPTextModel
22
+
23
+ from fid_utils import calculate_fid_given_features
24
+ from lora_diffusion import monkeypatch_or_replace_lora, tune_lora_scale
25
+
26
+ from models.blip_override.blip import blip_feature_extractor, init_tokenizer
27
+ from models.diffusers_override.unet_2d_condition import UNet2DConditionModel
28
+ from models.inception import InceptionV3
29
+ unet_target_replace_module = {"CrossAttention", "Attention", "GEGLU"}
30
+ #!/usr/bin/env python3
31
+ from transformers import CLIPProcessor
32
+ import transformers
33
+ from PIL import Image
34
+ import PIL.Image
35
+ import numpy as np
36
+ import torchvision.transforms as tvtrans
37
+ import requests
38
+ from io import BytesIO
39
+
40
+ class LightningDataset(pl.LightningDataModule):
41
+ def __init__(self, args: DictConfig):
42
+ super(LightningDataset, self).__init__()
43
+ self.kwargs = {"num_workers": args.num_workers, "persistent_workers": True if args.num_workers > 0 else False,
44
+ "pin_memory": True}
45
+ self.args = args
46
+
47
+ def setup(self, stage="fit"):
48
+ if self.args.dataset == "pororo":
49
+ import datasets.pororo as data
50
+ elif self.args.dataset == 'flintstones':
51
+ import datasets.flintstones as data
52
+ elif self.args.dataset == 'vistsis':
53
+ import datasets.vistsis as data
54
+ elif self.args.dataset == 'vistdii':
55
+ import datasets.vistdii as data
56
+ else:
57
+ raise ValueError("Unknown dataset: {}".format(self.args.dataset))
58
+ if stage == "fit":
59
+ self.train_data = data.StoryDataset("train", self.args)
60
+ self.val_data = data.StoryDataset("val", self.args)
61
+ if stage == "test":
62
+ self.test_data = data.StoryDataset("test", self.args)
63
+
64
+ def train_dataloader(self):
65
+ if not hasattr(self, 'trainloader'):
66
+ self.trainloader = DataLoader(self.train_data, batch_size=self.args.batch_size, shuffle=True, **self.kwargs)
67
+ return self.trainloader
68
+
69
+ def val_dataloader(self):
70
+ return DataLoader(self.val_data, batch_size=self.args.batch_size, shuffle=False, **self.kwargs)
71
+
72
+ def test_dataloader(self):
73
+ return DataLoader(self.test_data, batch_size=self.args.batch_size, shuffle=False, **self.kwargs)
74
+
75
+ def predict_dataloader(self):
76
+ return DataLoader(self.test_data, batch_size=self.args.batch_size, shuffle=False, **self.kwargs)
77
+
78
+ def get_length_of_train_dataloader(self):
79
+ if not hasattr(self, 'trainloader'):
80
+ self.trainloader = DataLoader(self.train_data, batch_size=self.args.batch_size, shuffle=True, **self.kwargs)
81
+ return len(self.trainloader)
82
+
83
+
84
+ class ARLDM(pl.LightningModule):
85
+ def __init__(self, args: DictConfig, steps_per_epoch=1):
86
+ super(ARLDM, self).__init__()
87
+ self.args = args
88
+ self.steps_per_epoch = steps_per_epoch
89
+ """
90
+ Configurations
91
+ """
92
+ self.task = args.task
93
+
94
+ if args.mode == 'sample':
95
+ if args.scheduler == "pndm":
96
+ self.scheduler = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
97
+ skip_prk_steps=True)
98
+ elif args.scheduler == "ddim":
99
+ self.scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
100
+ clip_sample=False, set_alpha_to_one=True)
101
+ else:
102
+ raise ValueError("Scheduler not supported")
103
+ self.fid_augment = transforms.Compose([
104
+ transforms.Resize([64, 64]),
105
+ transforms.ToTensor(),
106
+ transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
107
+ ])
108
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[2048]
109
+ self.inception = InceptionV3([block_idx])
110
+
111
+ self.clip_tokenizer = CLIPTokenizer.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder="tokenizer")
112
+ ##############################
113
+ #self.clip_tokenizer.save_pretrained('/root/lihui/StoryVisualization/save_pretrained/tokenizer')
114
+ self.blip_tokenizer = init_tokenizer()
115
+ self.blip_image_processor = transforms.Compose([
116
+ transforms.Resize([224, 224]),
117
+ transforms.ToTensor(),
118
+ transforms.Normalize([0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711])
119
+ ])
120
+ self.max_length = args.get(args.dataset).max_length
121
+
122
+ blip_image_null_token = self.blip_image_processor(
123
+ Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))).unsqueeze(0).float()
124
+ clip_text_null_token = self.clip_tokenizer([""], padding="max_length", max_length=self.max_length,
125
+ return_tensors="pt").input_ids
126
+ blip_text_null_token = self.blip_tokenizer([""], padding="max_length", max_length=self.max_length,
127
+ return_tensors="pt").input_ids
128
+
129
+ self.register_buffer('clip_text_null_token', clip_text_null_token)
130
+ self.register_buffer('blip_text_null_token', blip_text_null_token)
131
+ self.register_buffer('blip_image_null_token', blip_image_null_token)
132
+
133
+ self.text_encoder = CLIPTextModel.from_pretrained('runwayml/stable-diffusion-v1-5',
134
+ subfolder="text_encoder")
135
+ ############################################
136
+ #self.text_encoder.save_pretrained('/root/lihui/StoryVisualization/save_pretrained/text_encoder')
137
+ self.text_encoder.resize_token_embeddings(args.get(args.dataset).clip_embedding_tokens)
138
+ # resize_position_embeddings
139
+ old_embeddings = self.text_encoder.text_model.embeddings.position_embedding
140
+ new_embeddings = self.text_encoder._get_resized_embeddings(old_embeddings, self.max_length)
141
+ self.text_encoder.text_model.embeddings.position_embedding = new_embeddings
142
+ self.text_encoder.config.max_position_embeddings = self.max_length
143
+ self.text_encoder.max_position_embeddings = self.max_length
144
+ self.text_encoder.text_model.embeddings.position_ids = torch.arange(self.max_length).expand((1, -1))
145
+
146
+ self.modal_type_embeddings = nn.Embedding(2, 768)
147
+ self.time_embeddings = nn.Embedding(5, 768)
148
+ self.mm_encoder = blip_feature_extractor(
149
+ # pretrained='https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth',
150
+ pretrained='/root/lihui/StoryVisualization/save_pretrained/model_large.pth',
151
+ image_size=224, vit='large')#, local_files_only=True)
152
+ self.mm_encoder.text_encoder.resize_token_embeddings(args.get(args.dataset).blip_embedding_tokens)
153
+
154
+ self.vae = AutoencoderKL.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder="vae")
155
+ self.unet = UNet2DConditionModel.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder="unet")
156
+
157
+ self.noise_scheduler = DDPMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
158
+ num_train_timesteps=1000)
159
+ # monkeypatch_or_replace_lora(
160
+ # self.unet,
161
+ # torch.load("lora/example_loras/analog_svd_rank4.safetensors"),
162
+ # r=4,
163
+ # target_replace_module=unet_target_replace_module,
164
+ # )
165
+ #
166
+ # tune_lora_scale(self.unet, 1.00)
167
+ #tune_lora_scale(self.text_encoder, 1.00)
168
+
169
+ # torch.manual_seed(0)
170
+ ###################################
171
+ #self.vae.save_pretrained('/root/lihui/StoryVisualization/save_pretrained/vae')
172
+ #self.unet.save_pretrained('/root/lihui/StoryVisualization/save_pretrained/unet')
173
+
174
+ # Freeze vae and unet
175
+ self.freeze_params(self.vae.parameters())
176
+ if args.freeze_resnet:
177
+ self.freeze_params([p for n, p in self.unet.named_parameters() if "attentions" not in n])
178
+
179
+ if args.freeze_blip and hasattr(self, "mm_encoder"):
180
+ self.freeze_params(self.mm_encoder.parameters())
181
+ self.unfreeze_params(self.mm_encoder.text_encoder.embeddings.word_embeddings.parameters())
182
+
183
+ if args.freeze_clip and hasattr(self, "text_encoder"):
184
+ self.freeze_params(self.text_encoder.parameters())
185
+ self.unfreeze_params(self.text_encoder.text_model.embeddings.token_embedding.parameters())
186
+
187
+ @staticmethod
188
+ def freeze_params(params):
189
+ for param in params:
190
+ param.requires_grad = False
191
+
192
+ @staticmethod
193
+ def unfreeze_params(params):
194
+ for param in params:
195
+ param.requires_grad = True
196
+
197
+ def configure_optimizers(self):
198
+ optimizer = torch.optim.AdamW(self.parameters(), lr=self.args.init_lr, weight_decay=1e-4) # optim_bits=8
199
+ scheduler = LinearWarmupCosineAnnealingLR(optimizer,
200
+ warmup_epochs=self.args.warmup_epochs * self.steps_per_epoch,
201
+ max_epochs=self.args.max_epochs * self.steps_per_epoch)
202
+ optim_dict = {
203
+ 'optimizer': optimizer,
204
+ 'lr_scheduler': {
205
+ 'scheduler': scheduler, # The LR scheduler instance (required)
206
+ 'interval': 'step', # The unit of the scheduler's step size
207
+ }
208
+ }
209
+ return optim_dict
210
+
211
+ def forward(self, batch):
212
+ if self.args.freeze_clip and hasattr(self, "text_encoder"):
213
+ self.text_encoder.eval()
214
+ if self.args.freeze_blip and hasattr(self, "mm_encoder"):
215
+ self.mm_encoder.eval()
216
+ images, captions, attention_mask, source_images, source_caption, source_attention_mask, texts, ori_images = batch
217
+ B, V, S = captions.shape
218
+ src_V = V + 1 if self.task == 'continuation' else V
219
+ images = torch.flatten(images, 0, 1)
220
+ captions = torch.flatten(captions, 0, 1)
221
+ attention_mask = torch.flatten(attention_mask, 0, 1)
222
+ source_images = torch.flatten(source_images, 0, 1)
223
+ source_caption = torch.flatten(source_caption, 0, 1)
224
+ source_attention_mask = torch.flatten(source_attention_mask, 0, 1)
225
+ # 1 is not masked, 0 is masked
226
+
227
+ classifier_free_idx = np.random.rand(B * V) < 0.1
228
+
229
+ caption_embeddings = self.text_encoder(captions, attention_mask).last_hidden_state # B * V, S, D
230
+ source_embeddings = self.mm_encoder(source_images, source_caption, source_attention_mask,
231
+ mode='multimodal').reshape(B, src_V * S, -1)
232
+ source_embeddings = source_embeddings.repeat_interleave(V, dim=0)
233
+ caption_embeddings[classifier_free_idx] = \
234
+ self.text_encoder(self.clip_text_null_token).last_hidden_state[0]
235
+ source_embeddings[classifier_free_idx] = \
236
+ self.mm_encoder(self.blip_image_null_token, self.blip_text_null_token, attention_mask=None,
237
+ mode='multimodal')[0].repeat(src_V, 1)
238
+ caption_embeddings += self.modal_type_embeddings(torch.tensor(0, device=self.device))
239
+ source_embeddings += self.modal_type_embeddings(torch.tensor(1, device=self.device))
240
+ source_embeddings += self.time_embeddings(
241
+ torch.arange(src_V, device=self.device).repeat_interleave(S, dim=0))
242
+ encoder_hidden_states = torch.cat([caption_embeddings, source_embeddings], dim=1)
243
+
244
+ attention_mask = torch.cat(
245
+ [attention_mask, source_attention_mask.reshape(B, src_V * S).repeat_interleave(V, dim=0)], dim=1)
246
+ attention_mask = ~(attention_mask.bool()) # B * V, (src_V + 1) * S
247
+ attention_mask[classifier_free_idx] = False
248
+
249
+ # B, V, V, S
250
+ square_mask = torch.triu(torch.ones((V, V), device=self.device)).bool()
251
+ square_mask = square_mask.unsqueeze(0).unsqueeze(-1).expand(B, V, V, S)
252
+ square_mask = square_mask.reshape(B * V, V * S)
253
+ attention_mask[:, -V * S:] = torch.logical_or(square_mask, attention_mask[:, -V * S:])
254
+
255
+ latents = self.vae.encode(images).latent_dist.sample()
256
+ latents = latents * 0.18215
257
+
258
+ noise = torch.randn(latents.shape, device=self.device)
259
+ bsz = latents.shape[0]
260
+ timesteps = torch.randint(0, self.noise_scheduler.num_train_timesteps, (bsz,), device=self.device).long()
261
+ noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)
262
+
263
+ noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states, attention_mask).sample
264
+ loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()
265
+ return loss
266
+
267
+ def sample(self, batch):
268
+ original_images, captions, attention_mask, source_images, source_caption, source_attention_mask, texts, ori_test_images = batch
269
+ B, V, S = captions.shape
270
+ src_V = V + 1 if self.task == 'continuation' else V
271
+ original_images = torch.flatten(original_images, 0, 1)
272
+ captions = torch.flatten(captions, 0, 1)
273
+ attention_mask = torch.flatten(attention_mask, 0, 1)
274
+ source_images = torch.flatten(source_images, 0, 1)
275
+ source_caption = torch.flatten(source_caption, 0, 1)
276
+ source_attention_mask = torch.flatten(source_attention_mask, 0, 1)
277
+
278
+ caption_embeddings = self.text_encoder(captions, attention_mask).last_hidden_state # B * V, S, D
279
+ source_embeddings = self.mm_encoder(source_images, source_caption, source_attention_mask,
280
+ mode='multimodal').reshape(B, src_V * S, -1)
281
+ caption_embeddings += self.modal_type_embeddings(torch.tensor(0, device=self.device))
282
+ source_embeddings += self.modal_type_embeddings(torch.tensor(1, device=self.device))
283
+ source_embeddings += self.time_embeddings(
284
+ torch.arange(src_V, device=self.device).repeat_interleave(S, dim=0))
285
+ source_embeddings = source_embeddings.repeat_interleave(V, dim=0)
286
+ encoder_hidden_states = torch.cat([caption_embeddings, source_embeddings], dim=1)
287
+
288
+ attention_mask = torch.cat(
289
+ [attention_mask, source_attention_mask.reshape(B, src_V * S).repeat_interleave(V, dim=0)], dim=1)
290
+ attention_mask = ~(attention_mask.bool()) # B * V, (src_V + 1) * S
291
+ # B, V, V, S
292
+ square_mask = torch.triu(torch.ones((V, V), device=self.device)).bool()
293
+ square_mask = square_mask.unsqueeze(0).unsqueeze(-1).expand(B, V, V, S)
294
+ square_mask = square_mask.reshape(B * V, V * S)
295
+ attention_mask[:, -V * S:] = torch.logical_or(square_mask, attention_mask[:, -V * S:])
296
+
297
+ uncond_caption_embeddings = self.text_encoder(self.clip_text_null_token).last_hidden_state
298
+ uncond_source_embeddings = self.mm_encoder(self.blip_image_null_token, self.blip_text_null_token,
299
+ attention_mask=None, mode='multimodal').repeat(1, src_V, 1)
300
+ uncond_caption_embeddings += self.modal_type_embeddings(torch.tensor(0, device=self.device))
301
+ uncond_source_embeddings += self.modal_type_embeddings(torch.tensor(1, device=self.device))
302
+ uncond_source_embeddings += self.time_embeddings(
303
+ torch.arange(src_V, device=self.device).repeat_interleave(S, dim=0))
304
+ uncond_embeddings = torch.cat([uncond_caption_embeddings, uncond_source_embeddings], dim=1)
305
+ uncond_embeddings = uncond_embeddings.expand(B * V, -1, -1)
306
+
307
+ encoder_hidden_states = torch.cat([uncond_embeddings, encoder_hidden_states])
308
+ uncond_attention_mask = torch.zeros((B * V, (src_V + 1) * S), device=self.device).bool()
309
+ uncond_attention_mask[:, -V * S:] = square_mask
310
+ attention_mask = torch.cat([uncond_attention_mask, attention_mask], dim=0)
311
+
312
+ attention_mask = attention_mask.reshape(2, B, V, (src_V + 1) * S)
313
+ images = list()
314
+ for i in range(V):
315
+ encoder_hidden_states = encoder_hidden_states.reshape(2, B, V, (src_V + 1) * S, -1)
316
+ new_image = self.diffusion(encoder_hidden_states[:, :, i].reshape(2 * B, (src_V + 1) * S, -1),
317
+ attention_mask[:, :, i].reshape(2 * B, (src_V + 1) * S),
318
+ 512, 512, self.args.num_inference_steps, self.args.guidance_scale, 0.0)
319
+ images += new_image
320
+
321
+ new_image = torch.stack([self.blip_image_processor(im) for im in new_image]).to(self.device)
322
+
323
+ new_embedding = self.mm_encoder(new_image, # B,C,H,W
324
+ source_caption.reshape(B, src_V, S)[:, i + src_V - V],
325
+ source_attention_mask.reshape(B, src_V, S)[:, i + src_V - V],
326
+ mode='multimodal') # B, S, D
327
+ new_embedding = new_embedding.repeat_interleave(V, dim=0)
328
+ new_embedding += self.modal_type_embeddings(torch.tensor(1, device=self.device))
329
+ new_embedding += self.time_embeddings(torch.tensor(i + src_V - V, device=self.device))
330
+
331
+ encoder_hidden_states = encoder_hidden_states[1].reshape(B * V, (src_V + 1) * S, -1)
332
+ encoder_hidden_states[:, (i + 1 + src_V - V) * S:(i + 2 + src_V - V) * S] = new_embedding
333
+ encoder_hidden_states = torch.cat([uncond_embeddings, encoder_hidden_states])
334
+
335
+ return original_images, images, texts, ori_test_images
336
+
337
+
338
+ def training_step(self, batch, batch_idx):
339
+ loss = self(batch)
340
+ self.log('loss/train_loss', loss, on_step=True, on_epoch=False, sync_dist=True, prog_bar=True)
341
+ return loss
342
+
343
+ def validation_step(self, batch, batch_idx):
344
+ loss = self(batch)
345
+ self.log('loss/val_loss', loss, on_step=False, on_epoch=True, sync_dist=True, prog_bar=True)
346
+
347
+ def predict_step(self, batch, batch_idx, dataloader_idx=0):
348
+ original_images, images, texts, ori_test_images = self.sample(batch)
349
+ if self.args.calculate_fid:
350
+ original_images = original_images.cpu().numpy().astype('uint8')
351
+ original_images = [Image.fromarray(im, 'RGB') for im in original_images]
352
+
353
+ # ori_test_images = torch.stack(ori_test_images).cpu().numpy().astype('uint8')
354
+ # ori_test_images = [Image.fromarray(im, 'RGB') for im in ori_test_images]
355
+ ori = self.inception_feature(original_images).cpu().numpy()
356
+ gen = self.inception_feature(images).cpu().numpy()
357
+ else:
358
+ ori = None
359
+ gen = None
360
+
361
+ return images, ori, gen, ori_test_images, texts
362
+
363
+ def diffusion(self, encoder_hidden_states, attention_mask, height, width, num_inference_steps, guidance_scale, eta):
364
+ latents = torch.randn((encoder_hidden_states.shape[0] // 2, self.unet.in_channels, height // 8, width // 8),
365
+ device=self.device)
366
+
367
+ # set timesteps
368
+ accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
369
+ extra_set_kwargs = {}
370
+ if accepts_offset:
371
+ extra_set_kwargs["offset"] = 1
372
+
373
+ self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
374
+
375
+ # if we use LMSDiscreteScheduler, let's make sure latents are mulitplied by sigmas
376
+ if isinstance(self.scheduler, LMSDiscreteScheduler):
377
+ latents = latents * self.scheduler.sigmas[0]
378
+
379
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
380
+ extra_step_kwargs = {}
381
+ if accepts_eta:
382
+ extra_step_kwargs["eta"] = eta
383
+
384
+ for i, t in enumerate(self.scheduler.timesteps):
385
+ # expand the latents if we are doing classifier free guidance
386
+ latent_model_input = torch.cat([latents] * 2)
387
+
388
+ # noise_pred = self.unet(latent_model_input, t, encoder_hidden_states).sample
389
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states, attention_mask).sample
390
+
391
+ # perform guidance
392
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
393
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
394
+
395
+ # compute the previous noisy sample x_t -> x_t-1
396
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
397
+
398
+ # scale and decode the image latents with vae
399
+ latents = 1 / 0.18215 * latents
400
+ image = self.vae.decode(latents).sample
401
+
402
+ image = (image / 2 + 0.5).clamp(0, 1)
403
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
404
+
405
+ return self.numpy_to_pil(image)
406
+
407
+ @staticmethod
408
+ def numpy_to_pil(images):
409
+ """
410
+ Convert a numpy image or a batch of images to a PIL image.
411
+ """
412
+ if images.ndim == 3:
413
+ images = images[None, ...]
414
+ images = (images * 255).round().astype("uint8")
415
+ pil_images = [Image.fromarray(image, 'RGB') for image in images]
416
+
417
+ return pil_images
418
+
419
+ def inception_feature(self, images):
420
+ images = torch.stack([self.fid_augment(image) for image in images])
421
+ images = images.type(torch.FloatTensor).to(self.device)
422
+ images = (images + 1) / 2
423
+ images = F.interpolate(images, size=(299, 299), mode='bilinear', align_corners=False)
424
+ pred = self.inception(images)[0]
425
+
426
+ if pred.shape[2] != 1 or pred.shape[3] != 1:
427
+ pred = F.adaptive_avg_pool2d(pred, output_size=(1, 1))
428
+ return pred.reshape(-1, 2048)
429
+
430
+
431
+ def train(args: DictConfig) -> None:
432
+ dataloader = LightningDataset(args)
433
+ dataloader.setup('fit')
434
+ # dataloader.
435
+ model = ARLDM(args, steps_per_epoch=dataloader.get_length_of_train_dataloader())
436
+
437
+ logger = TensorBoardLogger(save_dir=os.path.join(args.ckpt_dir, args.run_name), name='log', default_hp_metric=False)
438
+
439
+ checkpoint_callback = ModelCheckpoint(
440
+ dirpath=os.path.join(args.ckpt_dir, args.run_name),
441
+ save_top_k=0,
442
+ save_last=True
443
+ )
444
+
445
+ lr_monitor = LearningRateMonitor(logging_interval='step')
446
+
447
+ callback_list = [lr_monitor, checkpoint_callback]
448
+
449
+ trainer = pl.Trainer(
450
+ accelerator='gpu',
451
+ devices=args.gpu_ids,
452
+ max_epochs=args.max_epochs,
453
+ benchmark=True,
454
+ logger=logger,
455
+ log_every_n_steps=1,
456
+ callbacks=callback_list,
457
+ strategy=DDPStrategy(find_unused_parameters=False)
458
+ )
459
+ trainer.fit(model, dataloader, ckpt_path=args.train_model_file)
460
+
461
+
462
+ def sample(args: DictConfig) -> None:
463
+
464
+ assert args.test_model_file is not None, "test_model_file cannot be None"
465
+ assert args.gpu_ids == 1 or len(args.gpu_ids) == 1, "Only one GPU is supported in test mode"
466
+ dataloader = LightningDataset(args)
467
+ dataloader.setup('test')
468
+ model = ARLDM.load_from_checkpoint(args.test_model_file, args=args, strict=False)
469
+
470
+ predictor = pl.Trainer(
471
+ accelerator='gpu',
472
+ devices=args.gpu_ids,
473
+ max_epochs=-1,
474
+ benchmark=True
475
+ )
476
+ predictions = predictor.predict(model, dataloader)
477
+ images = [elem for sublist in predictions for elem in sublist[0]]
478
+ ori_images = [elem for sublist in predictions for elem in sublist[3]]
479
+ ori_test_images = list()
480
+ if not os.path.exists(args.sample_output_dir):
481
+ try:
482
+ os.mkdir(args.sample_output_dir)
483
+ except:
484
+ pass
485
+
486
+ text_list = [elem for sublist in predictions for elem in sublist[4]]
487
+ ################################
488
+ # print(f"index: {index}")
489
+ num_images = len(images)
490
+ num_groups = (num_images + 4) // 5 # total number of 5-image groups (stories)
491
+
492
+ for g in range(num_groups):
493
+ print('Story {}:'.format(g + 1)) # print the story (group) number
494
+ start_index = g * 5 # start index of the current group
495
+ end_index = min(start_index + 5, num_images) # end index of the current group
496
+ for i in range(start_index, end_index):
497
+ print(text_list[i]) # print the corresponding caption
498
+ images[i].save(
499
+ os.path.join(args.sample_output_dir, 'group{:02d}_image{:02d}.png'.format(g + 1, i - start_index + 1)))
500
+ # ori_images[i] = ori_images[i]
501
+ ori_images_pil = Image.fromarray(np.uint8(ori_images[i].detach().cpu().squeeze().float().numpy())).convert("RGB")
502
+ ori_test_images.append(ori_images_pil)
503
+ ori_images_pil.save(
504
+ os.path.join('/root/lihui/StoryVisualization/ori_test_images_epoch10', 'group{:02d}_image{:02d}.png'.format(g + 1, i - start_index + 1)))
505
+ # for i, im in enumerate(ori_images):
506
+ # file_path = '/root/lihui/StoryVisualization/ori_test_images/image{}.png'.format(i)
507
+ # cv2.imwrite(file_path, im)
508
+
509
+
510
+ if args.calculate_fid:
511
+ ori = np.array([elem for sublist in predictions for elem in sublist[1]])
512
+ gen = np.array([elem for sublist in predictions for elem in sublist[2]])
513
+ fid = calculate_fid_given_features(ori, gen)
514
+ print('FID: {}'.format(fid))
515
+
516
+
517
+
518
+
519
+
520
+ @hydra.main(config_path=".", config_name="config")
521
+ def main(args: DictConfig) -> None:
522
+ pl.seed_everything(args.seed)
523
+ if args.num_cpu_cores > 0:
524
+ torch.set_num_threads(args.num_cpu_cores)
525
+
526
+ if args.mode == 'train':
527
+ ############################
528
+ train(args)
529
+ elif args.mode == 'sample':
530
+ # dataloader = LightningDataset(args)
531
+ # dataloader.setup('test')
532
+ sample(args)
533
+
534
+
535
+
536
+ if __name__ == '__main__':
537
+ main()
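The `diffusion()` loop in main.py applies standard classifier-free guidance: the U-Net runs on a doubled batch (unconditional and conditional halves) and the two predictions are recombined with `guidance_scale`. A minimal, self-contained sketch of just that recombination step, with toy tensors in place of real U-Net outputs:

```python
# Toy illustration of the guidance combination used in ARLDM.diffusion(); shapes are arbitrary.
import torch

guidance_scale = 6.0                         # matches config.yaml
noise_pred = torch.randn(2 * 4, 4, 64, 64)   # doubled batch: [uncond | cond]
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(guided.shape)  # torch.Size([4, 4, 64, 64])
```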
pororo_100.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b5d47440de7abbbbb2265e1d5ecbc1c5d4d3188434db3988cb13e7ec5fa7549
+ size 69568
readme-storyvisualization.md ADDED
@@ -0,0 +1,123 @@
+ ### Part 1: Cross-Modal Sequential Image Generation from Narrative Text
+
+ ## Environment Setup
+ conda create -n arldm python=3.8
+ conda activate arldm
+ conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch-lts
+ cd /root/lihui/StoryVisualization
+ pip install -r requirements.txt
+ ## Data Preparation
+ Download the PororoSV dataset here.
+ To accelerate I/O, use the following script to convert the downloaded data to HDF5:
+ python data_script/pororo_hdf5.py
+ --data_dir /path/to/pororo_data
+ --save_path /path/to/save_hdf5_file
+ ## Configuration File (config.yaml)
+
+ #device
+ mode: sample # train sample
+ ckpt_dir: /root/lihui/StoryVisualization/save_ckpt_epoch5_new # checkpoint directory
+ run_name: ARLDM # name for this run
+
+ #train
+ train_model_file: /root/lihui/StoryVisualization/save_ckpt_3last50/ARLDM/last.ckpt # model file for resume, none for train from scratch
+
+ #sample
+ test_model_file: /root/lihui/StoryVisualization/save_ckpt_3last50/ARLDM/last.ckpt # model file for test
+ sample_output_dir: /root/lihui/StoryVisualization/save_samples_128_epoch50 # output directory
+ ## Training
+ Specify your directories and device configuration in config.yaml, then run:
+ python main.py
+ ## Sampling
+ Specify your directories and device configuration in config.yaml, then run:
+ python main.py
+ ## Citation
+ @article{pan2022synthesizing,
+   title={Synthesizing Coherent Story with Auto-Regressive Latent Diffusion Models},
+   author={Pan, Xichen and Qin, Pengda and Li, Yuhong and Xue, Hui and Chen, Wenhu},
+   journal={arXiv preprint arXiv:2211.10950},
+   year={2022}
+ }
+
+
+ ### Part 2: Super-Resolution Based on Real-ESRGAN
+ Real-ESRGAN: Training Real-World Blind Super-Resolution with Pure Synthetic Data
+ [Paper]   [Project Page]   [YouTube Video]   [Bilibili Video]   [Poster]   [PPT]
+ Xintao Wang, Liangbin Xie, Chao Dong, Ying Shan
+ Tencent ARC Lab; Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences
+ ## Requirements
+ Python >= 3.7 (Anaconda or Miniconda recommended)
+ PyTorch >= 1.7
+ ## Installation
+ 1. Enter the pre-configured folder directly:
+ cd /root/lihui/StoryVisualization/Real-ESRGAN
+ 2. Or clone the project locally:
+ git clone https://github.com/xinntao/Real-ESRGAN.git && cd Real-ESRGAN
+ 3. Install the dependencies:
+ ```bash
+ # Install basicsr - https://github.com/xinntao/BasicSR
+ # We use BasicSR for both training and inference
+ pip install basicsr
+ # facexlib and gfpgan are used for face enhancement
+ pip install facexlib
+ pip install gfpgan
+ pip install -r requirements.txt
+ python setup.py develop
+ ```
+ ## Training
+ Trained model: RealESRGAN_x4plus_anime_6B
+ More information about and comparisons with waifu2x can be found in anime_model.md.
+ ## Download the Model
+ wget https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth -P weights
+ ## Inference
+ python inference_realesrgan.py -n RealESRGAN_x4plus_anime_6B -i inputs
+ Results are saved in the results folder.
+ ## BibTeX Citation
+ @Article{wang2021realesrgan,
+   title={Real-ESRGAN: Training Real-World Blind Super-Resolution with Pure Synthetic Data},
+   author={Xintao Wang and Liangbin Xie and Chao Dong and Ying Shan},
+   journal={arXiv:2107.10833},
+   year={2021}
+ }
+
+
+ ### Part 3: Target Character Detection Based on YOLOv5
+ ## Installation
+ Clone the repo and install requirements.txt in a Python>=3.7.0 environment with PyTorch>=1.7.
+ git clone https://github.com/ultralytics/yolov5 # clone
+ cd /root/lihui/StoryVisualization
+ cd yolov5
+ pip install -r requirements.txt # install
+ ## Convert the Images
+ cd /root/lihui/StoryVisualization
+ python transtoyolo.py
+ ## Inference with detect.py
+ detect.py runs inference on a variety of sources; models are downloaded automatically from the latest YOLOv5 release and results are saved to runs/detect.
+ python detect.py --weights yolov5s.pt --source 0 # webcam
+   img.jpg # image
+   vid.mp4 # video
+   screen # screenshot
+   path/ # directory
+   list.txt # list of images
+   list.streams # list of streams
+   'path/*.jpg' # glob
+   'https://youtu.be/Zgi9g1ksQHc' # YouTube
+   'rtsp://example.com/media.mp4' # RTSP, RTMP, HTTP stream
+ ## Training
+ The latest models and datasets are downloaded automatically from the YOLOv5 release. Training times for YOLOv5n/s/m/l/x are 1/2/4/6/8 days on a V100 GPU (multi-GPU training is faster). Use the largest --batch-size possible, or pass --batch-size -1 for YOLOv5 AutoBatch. The batch sizes shown below are for V100-16GB.
+ python train.py --data xxx.yaml --epochs 500 --weights '' --cfg yolov5l --batch-size 64
+ # xxx.yaml is the converted dataset file
+
+ ## License
+ YOLOv5 is available under two different licenses:
+ AGPL-3.0 License: see the LICENSE file for details.
+ Enterprise License: provides greater flexibility for commercial product development without the open-source requirements of AGPL-3.0. Typical use cases are embedding Ultralytics software and AI models into commercial products and applications. Apply for an Enterprise License at Ultralytics Licensing.
+
+
+ ### Part 4: Demo System
+
+ ## Specify the file directory and run:
+ cd /root/lihui/StoryVisualization/visualsystem
+ python main.py
+
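As a companion to the data-preparation step in the readme above: the converted pororo.h5 is read by split, with one pipe-separated text field and five JPEG-encoded frames per story. A minimal inspection sketch, assuming the HDF5 layout used in test.py and a placeholder path:

```python
# Minimal sketch, assuming the layout read by test.py: a 'test' group with a 'text'
# dataset of '|'-joined captions and 'image0'..'image4' encoded frames.
import cv2
import h5py

with h5py.File('/path/to/pororo.h5', 'r') as h5:
    test = h5['test']
    print(len(test['text']), 'test stories')
    captions = test['text'][0].decode('utf-8').split('|')
    print(captions)
    frame = cv2.imdecode(test['image0'][0], cv2.IMREAD_COLOR)  # decode the first frame
    print(frame.shape)
```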
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ pytorch_lightning<1.7.0
+ lightning-bolts
+ transformers==4.24.0
+ diffusers==0.7.2
+ timm
+ ftfy
+ hydra-core
+ opencv-python
+ h5py
+ scipy
run.sh ADDED
@@ -0,0 +1 @@
+ python main.py
test.py ADDED
@@ -0,0 +1,94 @@
+ import cv2
+ import h5py
+ import copy
+ import os
+ import random
+
+ import numpy
+ import numpy as np
+ from PIL import Image
+
+
+ def gettext(index):
+     with h5py.File('/root/lihui/StoryVisualization/pororo.h5', 'r') as h5:
+         story = list()
+         h5 = h5['test']
+         # read the text at the current index and decode it as UTF-8
+         texts = h5['text'][index].decode('utf-8').split('|')
+         symbol = '\n'
+         texts = symbol.join(texts)
+         texts = 'Story<' + str(index) + '> :' + '\n' + texts
+         print(texts)
+         return texts
+
+
+ # for i in range(1000):
+ #     gettext(i)
+
+ # take the first 100 samples of the dataset
+ # ###correct version##############
+ # # import h5py
+ # # import numpy as np
+ # # from PIL import Image
+ # #
+ # #
+ # # # create a subdirectory named "images" to save the images
+ # # os.makedirs("train_images", exist_ok=True)
+ # #
+ # # create an h5 file
+ # nf = h5py.File('/root/lihui/StoryVisualization/pororo_100.h5', "w")
+ # with h5py.File('/root/lihui/StoryVisualization/pororo.h5', 'r') as f:
+ #     test_group = f['test']
+ #     texts = np.array(test_group['text'][()])
+ #     ngroup = nf.create_group('test')
+ #     ntext = ngroup.create_dataset('text', (100,), dtype=h5py.string_dtype(encoding='utf-8'))
+ #     for i in range(100):
+ #         ntext[i]=texts[i]
+ #         print(f"sample {i}:")
+ #         # for j in range(5):
+ #         #     # build a fixed filename to save the image
+ #         #     # filename = os.path.join("images", f"image_{i}_{j}.png")
+ #         #     # # write the image data from the HDF5 file to that file
+ #         #     # with open(filename, "wb") as img_file:
+ #         #     #     img_file.write(test_group[f'image{j}'][i])
+ #         #     # print the text information and the filename
+ #         #     ntext[i]='|'.join(texts[i].decode('utf-8').split('|')[j])
+ #         #     print(f"image {j} saved to file: {filename}")
+ #         print(ntext[i])
+ # nf.close()
+
+ # save the test-set images by randomly cropping video frames
+ with h5py.File(r'C:\Users\zjlab\Desktop\StoryVisualization\pororo.h5', 'r') as h5:
+     h5 = h5['test']
+
+     for index in range(len(h5['text'])):  # len(h5['text'])
+         # index = int(index + 1)
+         # print(index)
+         images = list()
+         for i in range(5):
+             # read a group of images and the corresponding text from the h5 file
+             im = h5['image{}'.format(i)][index]
+             # print(im)
+             # pil_img = Image.fromarray(im)
+             # # save the image
+             # pil_img.save(os.path.join('/root/lihui/StoryVisualization/ori_test_images', '{:04d}.png'.format(i)))
+             # decode each image
+             im = cv2.imdecode(im, cv2.IMREAD_COLOR)
+             # randomly pick one 128-pixel slice of the image
+             idx = random.randint(0, im.shape[0] / 128 - 1)
+             # append the cropped slice to the images list
+             images.append(im[idx * 128: (idx + 1) * 128])
+         # deep copy, so it does not change with images afterwards
+         # ori_images = copy.deepcopy(images)
+         # save the original test images
+
+         # for i, im in enumerate(images):
+         #     file_path = 'C:/Users/zjlab/Desktop/StoryVisualization/test_images/group{:02d}_image{:02d}.png'.format(
+         #         index + 1,
+         #         i + 1)
+         #     cv2.imwrite(file_path, im)
+
+         ori_images_pil = Image.fromarray(images[i])  # numpy.uint8(images[i].detach().cpu().squeeze().float().numpy())).convert("RGB")
+         ori_images_pil.save(
+             os.path.join('C:/Users/zjlab/Desktop/StoryVisualization/test_images',
+                          'group{:02d}_image{:02d}.png'.format(index + 1, i + 1)))
transtoyolo.py ADDED
@@ -0,0 +1,320 @@
+ # -*- coding: utf-8 -*-
+
+ import os
+ import numpy as np
+ import json
+ from glob import glob
+ import cv2
+ import shutil
+ import yaml
+ from sklearn.model_selection import train_test_split
+ from tqdm import tqdm
+
+
+ # get the current working directory
+ ROOT_DIR = os.getcwd()
+
+ '''
+ Unify the image format
+ '''
+ def change_image_format(label_path=ROOT_DIR, suffix='.png'):
+     """
+     Unify the format of all images in the current folder, e.g. '.jpg'
+     :param suffix: image file suffix
+     :param label_path: current file path
+     :return:
+     """
+     externs = ['png', 'jpg', 'JPEG', 'BMP', 'bmp']
+     files = list()
+     # collect all images whose suffix is in externs
+     for extern in externs:
+         files.extend(glob(label_path + "\\*." + extern))
+     # iterate over all images and convert the image format
+     for file in files:
+         name = ''.join(file.split('.')[:-1])
+         file_suffix = file.split('.')[-1]
+         if file_suffix != suffix.split('.')[-1]:
+             # rename with the target suffix
+             new_name = name + suffix
+             # read the image
+             image = cv2.imread(file)
+             # rewrite the image in the target format
+             cv2.imwrite(new_name, image)
+             # delete the old image
+             os.remove(file)
+
+
+
+ '''
+ Read all json files and collect all classes
+ '''
+ def get_all_class(file_list, label_path=ROOT_DIR):
+     """
+     Get all classes of the current data from the json files
+     :param file_list: all file names under the current path
+     :param label_path: current file path
+     :return:
+     """
+     # initialize the class list
+     classes = list()
+     # iterate over all json files, read the label value of each shape and add it to classes
+     for filename in tqdm(file_list):
+         json_path = os.path.join(label_path, filename + '.json')
+         json_file = json.load(open(json_path, "r", encoding="utf-8"))
+         for item in json_file["shapes"]:
+             label_class = item['label']
+             if label_class not in classes:
+                 classes.append(label_class)
+     print('read file done')
+     return classes
+
+
+ '''
+ Split into training, validation and test sets
+ '''
+ def split_dataset(label_path, test_size=0.3, isUseTest=False, useNumpyShuffle=False):
+     """
+     Split the files into training, test and validation sets
+     :param useNumpyShuffle: use the numpy method to split the dataset
+     :param test_size: ratio of the test or validation split
+     :param isUseTest: whether to use a test set, default False
+     :param label_path: current file path
+     :return:
+     """
+     # get all json files
+     files = glob(label_path + "\\*.json")
+     files = [i.replace("\\", "/").split("/")[-1].split(".json")[0] for i in files]
+
+     if useNumpyShuffle:
+         file_length = len(files)
+         index = np.arange(file_length)
+         np.random.seed(32)
+         np.random.shuffle(index)  # random split
+
+         test_files = None
+         # whether there is a test set
+         if isUseTest:
+             trainval_files, test_files = np.array(files)[index[:int(file_length * (1 - test_size))]], np.array(files)[
+                 index[int(file_length * (1 - test_size)):]]
+         else:
+             trainval_files = files
+         # split into training and validation sets
+         train_files, val_files = np.array(trainval_files)[index[:int(len(trainval_files) * (1 - test_size))]], \
+             np.array(trainval_files)[index[int(len(trainval_files) * (1 - test_size)):]]
+     else:
+         test_files = None
+         if isUseTest:
+             trainval_files, test_files = train_test_split(files, test_size=test_size, random_state=55)
+         else:
+             trainval_files = files
+         train_files, val_files = train_test_split(trainval_files, test_size=test_size, random_state=55)
+
+     return train_files, val_files, test_files, files
+
+
+ '''
+ Create the folders for the yolov5 training, validation and test sets
+ '''
+ def create_save_file(label_path=ROOT_DIR):
+     """
+     Create the folders following the image and label paths used for training
+     :param label_path: current file path
+     :return:
+     """
+     # training set
+     train_image = os.path.join(label_path, 'train', 'images')
+     if not os.path.exists(train_image):
+         os.makedirs(train_image)
+     train_label = os.path.join(label_path, 'train', 'labels')
+     if not os.path.exists(train_label):
+         os.makedirs(train_label)
+     # validation set
+     val_image = os.path.join(label_path, 'valid', 'images')
+     if not os.path.exists(val_image):
+         os.makedirs(val_image)
+     val_label = os.path.join(label_path, 'valid', 'labels')
+     if not os.path.exists(val_label):
+         os.makedirs(val_label)
+     # test set
+     test_image = os.path.join(label_path, 'test', 'images')
+     if not os.path.exists(test_image):
+         os.makedirs(test_image)
+     test_label = os.path.join(label_path, 'test', 'labels')
+     if not os.path.exists(test_label):
+         os.makedirs(test_label)
+     return train_image, train_label, val_image, val_label, test_image, test_label
+
+
+
+ '''
+ Convert: given the image size, return the box centre and width/height
+ '''
+ def convert(size, box):
+     # width
+     dw = 1. / (size[0])
+     # height
+     dh = 1. / (size[1])
+
+     x = (box[0] + box[1]) / 2.0 - 1
+     y = (box[2] + box[3]) / 2.0 - 1
+     # width
+     w = box[1] - box[0]
+     # height
+     h = box[3] - box[2]
+
+     x = x * dw
+     w = w * dw
+     y = y * dh
+     h = h * dh
+     return x, y, w, h
+
+
+ '''
+ Move the images and label files into the given training, validation and test sets
+ '''
+ def push_into_file(file, images, labels, label_path=ROOT_DIR, suffix='.jpg'):
+     """
+     Move all files in the current folder, split into images and labels, into the training/validation/test folders
+     :param file: list of file names
+     :param images: path where images are stored
+     :param labels: path where labels are stored
+     :param label_path: current file path
+     :param suffix: image file suffix
+     :return:
+     """
+     # iterate over all files
+     for filename in file:
+         # image file
+         image_file = os.path.join(label_path, filename + suffix)
+         # label file
+         label_file = os.path.join(label_path, filename + '.txt')
+         # yolov5 image folder
+         if not os.path.exists(os.path.join(images, filename + suffix)):
+             try:
+                 shutil.move(image_file, images)
+             except OSError:
+                 pass
+         # yolov5 label folder
+         if not os.path.exists(os.path.join(labels, filename + suffix)):
+             try:
+                 shutil.move(label_file, labels)
+             except OSError:
+                 pass
+
+ '''
+
+ '''
+ def json2txt(classes, txt_Name='allfiles', label_path=ROOT_DIR, suffix='.png'):
+     """
+     Convert the json files into txt files and move the json files into a dedicated folder
+     :param classes: class names
+     :param txt_Name: txt file used to store the paths of all files
+     :param label_path: current file path
+     :param suffix: image file suffix
+     :return:
+     """
+     store_json = os.path.join(label_path, 'json')
+     if not os.path.exists(store_json):
+         os.makedirs(store_json)
+
+     _, _, _, files = split_dataset(label_path)
+     if not os.path.exists(os.path.join(label_path, 'tmp')):
+         os.makedirs(os.path.join(label_path, 'tmp'))
+
+     list_file = open('tmp/%s.txt' % txt_Name, 'w')
+     for json_file_ in tqdm(files):
+         json_filename = os.path.join(label_path, json_file_ + ".json")
+         imagePath = os.path.join(label_path, json_file_ + suffix)
+         list_file.write('%s\n' % imagePath)
+         out_file = open('%s/%s.txt' % (label_path, json_file_), 'w')
+         json_file = json.load(open(json_filename, "r", encoding="utf-8"))
+         if os.path.exists(imagePath):
+             height, width, channels = cv2.imread(imagePath).shape
+             for multi in json_file["shapes"]:
+                 if len(multi["points"][0]) == 0:
+                     out_file.write('')
+                     continue
+                 points = np.array(multi["points"])
+                 xmin = min(points[:, 0]) if min(points[:, 0]) > 0 else 0
+                 xmax = max(points[:, 0]) if max(points[:, 0]) > 0 else 0
+                 ymin = min(points[:, 1]) if min(points[:, 1]) > 0 else 0
+                 ymax = max(points[:, 1]) if max(points[:, 1]) > 0 else 0
+                 label = multi["label"]
+                 if xmax <= xmin:
+                     pass
+                 elif ymax <= ymin:
+                     pass
+                 else:
+                     cls_id = classes.index(label)
+                     b = (float(xmin), float(xmax), float(ymin), float(ymax))
+                     bb = convert((width, height), b)
+                     out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')
+                     # print(json_filename, xmin, ymin, xmax, ymax, cls_id)
+         if not os.path.exists(os.path.join(store_json, json_file_ + '.json')):
+             try:
+                 shutil.move(json_filename, store_json)
+             except OSError:
+                 pass
+
+ '''
+ Create the yaml file
+ '''
+ def create_yaml(classes, label_path, isUseTest=False):
+     nc = len(classes)
+     if not isUseTest:
+         desired_caps = {
+             'path': label_path,
+             'train': 'train/images',
+             'val': 'valid/images',
+             'nc': nc,
+             'names': classes
+         }
+     else:
+         desired_caps = {
+             'path': label_path,
+             'train': 'train/images',
+             'val': 'valid/images',
+             'test': 'test/images',
+             'nc': nc,
+             'names': classes
+         }
+     yamlpath = os.path.join(label_path, "data" + ".yaml")
+
+     # write to the yaml file
+     with open(yamlpath, "w+", encoding="utf-8") as f:
+         for key, val in desired_caps.items():
+             yaml.dump({key: val}, f, default_flow_style=False)
+
+
+ # First make sure all images in the current folder share the same suffix, e.g. .jpg; if they use another suffix, change suffix accordingly, e.g. .png
+ def ChangeToYolo5(label_path=r"D:\storydata", suffix='.png', test_size=0.1, isUseTest=False):
+     """
+     Generate the files in the final standard format
+     :param test_size: ratio of the test or validation split
+     :param label_path: current file path
+     :param suffix: file suffix
+     :param isUseTest: whether to use a test set
+     :return:
+     """
+     # step 1: unify the image format
+     change_image_format(label_path)
+     # step 2: split into training, validation and test sets according to the json files
+     train_files, val_files, test_file, files = split_dataset(label_path, test_size=test_size, isUseTest=isUseTest)
+     # step 3: get all classes from the json files
+     classes = get_all_class(files)
+     # step 4: convert the json files into txt files and move the json files into a dedicated folder
+     json2txt(classes)
+     # step 5: create the yaml file needed for yolov5 training
+     create_yaml(classes, label_path, isUseTest=isUseTest)
+     # step 6: create the folders for the yolov5 training, validation and test sets
+     train_image, train_label, val_image, val_label, test_image, test_label = create_save_file(label_path)
+     # step 7: move all images and label files into the corresponding training, validation and test sets
+     push_into_file(train_files, train_image, train_label, suffix=suffix)  # move files into the training set
+     push_into_file(val_files, val_image, val_label, suffix=suffix)  # move files into the validation set
+     if test_file is not None:  # if a test set exists, move its files into the test set folder
+         push_into_file(test_file, test_image, test_label, suffix=suffix)
+     print('create dataset done')
+
+
+ if __name__ == "__main__":
+     ChangeToYolo5()
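For reference, a quick numeric check of the `convert()` normalization above, using hypothetical label coordinates (the numbers are illustrative, not taken from the dataset; it assumes transtoyolo.py is importable, i.e. its sklearn/tqdm/yaml dependencies are installed):

```python
# Hypothetical example: a 640x480 image with a box spanning x in [100, 300], y in [50, 200].
from transtoyolo import convert

x, y, w, h = convert((640, 480), (100, 300, 50, 200))
# centre x = (100 + 300) / 2 - 1 = 199 -> 199 / 640 ≈ 0.3109
# centre y = (50 + 200) / 2 - 1  = 124 -> 124 / 480 ≈ 0.2583
# width    = 200 / 640 = 0.3125,  height = 150 / 480 = 0.3125
print(x, y, w, h)
```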