hysts committed
Commit: b85284b
Parent: e4d1395
.gitattributes CHANGED
@@ -1,3 +1,4 @@
+ *.png filter=lfs diff=lfs merge=lfs -text
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
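The added *.png rule routes the new pose images through Git LFS, which is why the pose_images/*.png files added further down appear as LFS pointer details rather than inline image data.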
.gitignore ADDED
@@ -0,0 +1 @@
+ pretrained_models
.gitmodules ADDED
@@ -0,0 +1,3 @@
+ [submodule "Text2Human"]
+ path = Text2Human
+ url = https://github.com/yumingj/Text2Human
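Since Text2Human is vendored as a git submodule, a local checkout needs either git clone --recursive or, after a plain clone, git submodule update --init to populate the Text2Human directory at the commit recorded below.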
.pre-commit-config.yaml ADDED
@@ -0,0 +1,46 @@
+ exclude: ^(Text2Human|patch)
+ repos:
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+   rev: v4.2.0
+   hooks:
+   - id: check-executables-have-shebangs
+   - id: check-json
+   - id: check-merge-conflict
+   - id: check-shebang-scripts-are-executable
+   - id: check-toml
+   - id: check-yaml
+   - id: double-quote-string-fixer
+   - id: end-of-file-fixer
+   - id: mixed-line-ending
+     args: ['--fix=lf']
+   - id: requirements-txt-fixer
+   - id: trailing-whitespace
+ - repo: https://github.com/myint/docformatter
+   rev: v1.4
+   hooks:
+   - id: docformatter
+     args: ['--in-place']
+ - repo: https://github.com/pycqa/isort
+   rev: 5.10.1
+   hooks:
+   - id: isort
+ - repo: https://github.com/pre-commit/mirrors-mypy
+   rev: v0.812
+   hooks:
+   - id: mypy
+     args: ['--ignore-missing-imports']
+ - repo: https://github.com/google/yapf
+   rev: v0.32.0
+   hooks:
+   - id: yapf
+     args: ['--parallel', '--in-place']
+ - repo: https://github.com/kynan/nbstripout
+   rev: 0.5.0
+   hooks:
+   - id: nbstripout
+     args: ['--extra-keys', 'metadata.interpreter metadata.kernelspec cell.metadata.pycharm']
+ - repo: https://github.com/nbQA-dev/nbQA
+   rev: 1.3.1
+   hooks:
+   - id: nbqa-isort
+   - id: nbqa-yapf
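With this config, the hooks run locally through the usual pre-commit workflow (pre-commit install to register the git hook, pre-commit run --all-files for a one-off pass over the repository); the exclude pattern keeps the formatters and linters away from the Text2Human submodule and the patch file.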
.style.yapf ADDED
@@ -0,0 +1,5 @@
+ [style]
+ based_on_style = pep8
+ blank_line_before_nested_class_or_def = false
+ spaces_before_comment = 2
+ split_before_logical_operator = true
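yapf picks up this [style] section automatically from .style.yapf, so the yapf hook configured above formats the repository with PEP 8-based settings plus the overrides listed here.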
Text2Human ADDED
@@ -0,0 +1 @@
+ Subproject commit 6d38607df89651704000d0e6571bfc640d185a77
app.py ADDED
@@ -0,0 +1,157 @@
+ #!/usr/bin/env python
+
+ from __future__ import annotations
+
+ import argparse
+ import os
+ import pathlib
+ import subprocess
+
+ import gradio as gr
+
+ if os.getenv('SYSTEM') == 'spaces':
+     subprocess.call('pip uninstall -y mmcv-full'.split())
+     subprocess.call('pip install mmcv-full==1.5.2'.split())
+     subprocess.call('git apply ../patch'.split(), cwd='Text2Human')
+
+ from model import Model
+
+
+ def parse_args() -> argparse.Namespace:
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--device', type=str, default='cpu')
+     parser.add_argument('--theme', type=str)
+     parser.add_argument('--share', action='store_true')
+     parser.add_argument('--port', type=int)
+     parser.add_argument('--disable-queue',
+                         dest='enable_queue',
+                         action='store_false')
+     return parser.parse_args()
+
+
+ def set_example_image(example: list) -> dict:
+     return gr.Image.update(value=example[0])
+
+
+ def set_example_text(example: list) -> dict:
+     return gr.Textbox.update(value=example[0])
+
+
+ def main():
+     args = parse_args()
+     model = Model(args.device)
+
+     css = '''
+ h1#title {
+   text-align: center;
+ }
+ #input-image {
+   max-height: 300px;
+ }
+ #label-image {
+   height: 300px;
+ }
+ #result-image {
+   height: 300px;
+ }
+ '''
+
+     with gr.Blocks(theme=args.theme, css=css) as demo:
+         gr.Markdown('''<h1 id="title">Text2Human</h1>
+
+ This is an unofficial demo for <a href="https://github.com/yumingj/Text2Human">https://github.com/yumingj/Text2Human</a>.
+ ''')
+         with gr.Row():
+             with gr.Column():
+                 with gr.Row():
+                     input_image = gr.Image(label='Input Pose Image',
+                                            type='pil',
+                                            elem_id='input-image')
+                 with gr.Row():
+                     paths = sorted(pathlib.Path('pose_images').glob('*.png'))
+                     example_images = gr.Dataset(components=[input_image],
+                                                 samples=[[path.as_posix()]
+                                                          for path in paths])
+
+             with gr.Column():
+                 with gr.Row():
+                     label_image = gr.Image(label='Label Image',
+                                            type='numpy',
+                                            elem_id='label-image')
+                 with gr.Row():
+                     shape_text = gr.Textbox(
+                         label='Shape Description',
+                         placeholder=
+                         '''<gender>, <sleeve length>, <length of lower clothing>, <outer clothing type>, <other accessories1>, ...
+ Note: The outer clothing type and accessories can be omitted.''')
+                 with gr.Row():
+                     shape_example_texts = gr.Dataset(
+                         components=[shape_text],
+                         samples=[['man, sleeveless T-shirt, long pants'],
+                                  ['woman, short-sleeve T-shirt, short jeans']])
+                 with gr.Row():
+                     generate_label_button = gr.Button('Generate Label Image')
+
+             with gr.Column():
+                 with gr.Row():
+                     result = gr.Image(label='Result',
+                                       type='numpy',
+                                       elem_id='result-image')
+                 with gr.Row():
+                     texture_text = gr.Textbox(
+                         label='Texture Description',
+                         placeholder=
+                         '''<upper clothing texture>, <lower clothing texture>, <outer clothing texture>
+ Note: Currently, only 5 types of textures are supported, i.e., pure color, stripe/spline, plaid/lattice, floral, denim.'''
+                     )
+                 with gr.Row():
+                     texture_example_texts = gr.Dataset(
+                         components=[texture_text],
+                         samples=[['pure color, denim'], ['floral, stripe']])
+                 with gr.Row():
+                     sample_steps = gr.Slider(10,
+                                              300,
+                                              value=10,
+                                              step=10,
+                                              label='Sample Steps')
+                 with gr.Row():
+                     seed = gr.Slider(0, 1000000, value=0, step=1, label='Seed')
+                 with gr.Row():
+                     generate_human_button = gr.Button('Generate Human')
+
+         gr.Markdown(
+             '<center><img src="https://visitor-badge.glitch.me/badge?page_id=hysts.text2human" alt="visitor badge"/></center>'
+         )
+
+         input_image.change(fn=model.process_pose_image,
+                            inputs=[input_image],
+                            outputs=None)
+         generate_label_button.click(fn=model.generate_label_image,
+                                     inputs=[shape_text],
+                                     outputs=[label_image])
+         generate_human_button.click(fn=model.generate_human,
+                                     inputs=[
+                                         texture_text,
+                                         sample_steps,
+                                         seed,
+                                     ],
+                                     outputs=[result])
+         example_images.click(fn=set_example_image,
+                              inputs=example_images,
+                              outputs=example_images.components)
+         shape_example_texts.click(fn=set_example_text,
+                                   inputs=shape_example_texts,
+                                   outputs=shape_example_texts.components)
+         texture_example_texts.click(fn=set_example_text,
+                                     inputs=texture_example_texts,
+                                     outputs=texture_example_texts.components)
+
+     demo.launch(
+         enable_queue=args.enable_queue,
+         server_port=args.port,
+         share=args.share,
+     )
+
+
+ if __name__ == '__main__':
+     main()
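Outside of Spaces, the same entry point can be run locally, e.g. python app.py --device cpu --share, assuming the submodule is checked out, requirements.txt is installed, and the patch below has been applied to Text2Human; on Spaces, app.py performs that last step itself when the SYSTEM environment variable is set to spaces, along with the mmcv-full reinstall.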
model.py ADDED
@@ -0,0 +1,134 @@
+ from __future__ import annotations
+
+ import os
+ import pathlib
+ import sys
+ import zipfile
+
+ import huggingface_hub
+ import numpy as np
+ import PIL.Image
+ import torch
+
+ sys.path.insert(0, 'Text2Human')
+
+ from models.sample_model import SampleFromPoseModel
+ from utils.language_utils import (generate_shape_attributes,
+                                   generate_texture_attributes)
+ from utils.options import dict_to_nonedict, parse
+ from utils.util import set_random_seed
+
+ COLOR_LIST = [
+     (0, 0, 0),
+     (255, 250, 250),
+     (220, 220, 220),
+     (250, 235, 215),
+     (255, 250, 205),
+     (211, 211, 211),
+     (70, 130, 180),
+     (127, 255, 212),
+     (0, 100, 0),
+     (50, 205, 50),
+     (255, 255, 0),
+     (245, 222, 179),
+     (255, 140, 0),
+     (255, 0, 0),
+     (16, 78, 139),
+     (144, 238, 144),
+     (50, 205, 174),
+     (50, 155, 250),
+     (160, 140, 88),
+     (213, 140, 88),
+     (90, 140, 90),
+     (185, 210, 205),
+     (130, 165, 180),
+     (225, 141, 151),
+ ]
+
+
+ class Model:
+     def __init__(self, device: str):
+         self.config = self._load_config()
+         self.config['device'] = device
+         self._download_models()
+         self.model = SampleFromPoseModel(self.config)
+
+     def _load_config(self) -> dict:
+         path = 'Text2Human/configs/sample_from_pose.yml'
+         config = parse(path, is_train=False)
+         config = dict_to_nonedict(config)
+         return config
+
+     def _download_models(self) -> None:
+         model_dir = pathlib.Path('pretrained_models')
+         if model_dir.exists():
+             return
+         token = os.getenv('HF_TOKEN')
+         path = huggingface_hub.hf_hub_download('hysts/Text2Human',
+                                                'orig/pretrained_models.zip',
+                                                use_auth_token=token)
+         model_dir.mkdir()
+         with zipfile.ZipFile(path) as f:
+             f.extractall(model_dir)
+
+     @staticmethod
+     def preprocess_pose_image(image: PIL.Image.Image) -> torch.Tensor:
+         image = np.array(
+             image.resize(
+                 size=(256, 512),
+                 resample=PIL.Image.Resampling.LANCZOS))[:, :, 2:].transpose(
+                     2, 0, 1).astype(np.float32)
+         image = image / 12. - 1
+         data = torch.from_numpy(image).unsqueeze(1)
+         return data
+
+     @staticmethod
+     def process_mask(mask: torch.Tensor) -> torch.Tensor:
+         seg_map = np.full(mask.shape[:-1], -1)
+         for index, color in enumerate(COLOR_LIST):
+             seg_map[np.sum(mask == color, axis=2) == 3] = index
+         assert (seg_map != -1).all()
+         return seg_map
+
+     @staticmethod
+     def postprocess(result: torch.Tensor) -> np.ndarray:
+         result = result.permute(0, 2, 3, 1)
+         result = result.detach().cpu().numpy()
+         result = result * 255
+         result = np.asarray(result[0, :, :, :], dtype=np.uint8)
+         return result
+
+     def process_pose_image(self, pose_image: PIL.Image.Image) -> None:
+         if pose_image is None:
+             return
+         data = self.preprocess_pose_image(pose_image)
+         self.model.feed_pose_data(data)
+
+     def generate_label_image(self, shape_text: str) -> np.ndarray:
+         shape_attributes = generate_shape_attributes(shape_text)
+         shape_attributes = torch.LongTensor(shape_attributes).unsqueeze(0)
+         self.model.feed_shape_attributes(shape_attributes)
+         self.model.generate_parsing_map()
+         self.model.generate_quantized_segm()
+         colored_segm = self.model.palette_result(self.model.segm[0].cpu())
+
+         mask = colored_segm.copy()
+         seg_map = self.process_mask(mask)
+         self.model.segm = torch.from_numpy(seg_map).unsqueeze(0).unsqueeze(
+             0).to(self.model.device)
+         self.model.generate_quantized_segm()
+         return colored_segm
+
+     def generate_human(self, texture_text: str, sample_steps: int,
+                        seed: int) -> np.ndarray:
+         set_random_seed(seed)
+
+         texture_attributes = generate_texture_attributes(texture_text)
+         texture_attributes = torch.LongTensor(texture_attributes)
+         self.model.feed_texture_attributes(texture_attributes)
+         self.model.generate_texture_map()
+
+         self.model.sample_steps = sample_steps
+         out = self.model.sample_and_refine()
+         res = self.postprocess(out)
+         return res
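For reference, a minimal sketch of driving Model outside the Gradio UI, assuming the Text2Human submodule is set up with the patch applied and the weights in hysts/Text2Human are reachable (set HF_TOKEN if needed); the prompt strings are taken from the example datasets in app.py:

    import PIL.Image

    from model import Model

    model = Model(device='cpu')  # downloads and unpacks pretrained_models/ on first use
    pose = PIL.Image.open('pose_images/000.png')
    model.process_pose_image(pose)  # feed the pose to SampleFromPoseModel
    label_map = model.generate_label_image('man, sleeveless T-shirt, long pants')
    result = model.generate_human('pure color, denim', sample_steps=10, seed=0)  # uint8 RGB array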
patch ADDED
@@ -0,0 +1,169 @@
+ diff --git a/models/hierarchy_inference_model.py b/models/hierarchy_inference_model.py
+ index 3116307..5de661d 100644
+ --- a/models/hierarchy_inference_model.py
+ +++ b/models/hierarchy_inference_model.py
+ @@ -21,7 +21,7 @@ class VQGANTextureAwareSpatialHierarchyInferenceModel():
+
+ def __init__(self, opt):
+ self.opt = opt
+ - self.device = torch.device('cuda')
+ + self.device = torch.device(opt['device'])
+ self.is_train = opt['is_train']
+
+ self.top_encoder = Encoder(
+ diff --git a/models/hierarchy_vqgan_model.py b/models/hierarchy_vqgan_model.py
+ index 4b0d657..0bf4712 100644
+ --- a/models/hierarchy_vqgan_model.py
+ +++ b/models/hierarchy_vqgan_model.py
+ @@ -20,7 +20,7 @@ class HierarchyVQSpatialTextureAwareModel():
+
+ def __init__(self, opt):
+ self.opt = opt
+ - self.device = torch.device('cuda')
+ + self.device = torch.device(opt['device'])
+ self.top_encoder = Encoder(
+ ch=opt['top_ch'],
+ num_res_blocks=opt['top_num_res_blocks'],
+ diff --git a/models/parsing_gen_model.py b/models/parsing_gen_model.py
+ index 9440345..15a1ecb 100644
+ --- a/models/parsing_gen_model.py
+ +++ b/models/parsing_gen_model.py
+ @@ -22,7 +22,7 @@ class ParsingGenModel():
+
+ def __init__(self, opt):
+ self.opt = opt
+ - self.device = torch.device('cuda')
+ + self.device = torch.device(opt['device'])
+ self.is_train = opt['is_train']
+
+ self.attr_embedder = ShapeAttrEmbedding(
+ diff --git a/models/sample_model.py b/models/sample_model.py
+ index 4c60e3f..5265cd0 100644
+ --- a/models/sample_model.py
+ +++ b/models/sample_model.py
+ @@ -23,7 +23,7 @@ class BaseSampleModel():
+
+ def __init__(self, opt):
+ self.opt = opt
+ - self.device = torch.device('cuda')
+ + self.device = torch.device(opt['device'])
+
+ # hierarchical VQVAE
+ self.decoder = Decoder(
+ @@ -123,7 +123,7 @@ class BaseSampleModel():
+
+ def load_top_pretrain_models(self):
+ # load pretrained vqgan
+ - top_vae_checkpoint = torch.load(self.opt['top_vae_path'])
+ + top_vae_checkpoint = torch.load(self.opt['top_vae_path'], map_location=self.device)
+
+ self.decoder.load_state_dict(
+ top_vae_checkpoint['decoder'], strict=True)
+ @@ -137,7 +137,7 @@ class BaseSampleModel():
+ self.top_post_quant_conv.eval()
+
+ def load_bot_pretrain_network(self):
+ - checkpoint = torch.load(self.opt['bot_vae_path'])
+ + checkpoint = torch.load(self.opt['bot_vae_path'], map_location=self.device)
+ self.bot_decoder_res.load_state_dict(
+ checkpoint['bot_decoder_res'], strict=True)
+ self.decoder.load_state_dict(checkpoint['decoder'], strict=True)
+ @@ -153,7 +153,7 @@ class BaseSampleModel():
+
+ def load_pretrained_segm_token(self):
+ # load pretrained vqgan for segmentation mask
+ - segm_token_checkpoint = torch.load(self.opt['segm_token_path'])
+ + segm_token_checkpoint = torch.load(self.opt['segm_token_path'], map_location=self.device)
+ self.segm_encoder.load_state_dict(
+ segm_token_checkpoint['encoder'], strict=True)
+ self.segm_quantizer.load_state_dict(
+ @@ -166,7 +166,7 @@ class BaseSampleModel():
+ self.segm_quant_conv.eval()
+
+ def load_index_pred_network(self):
+ - checkpoint = torch.load(self.opt['pretrained_index_network'])
+ + checkpoint = torch.load(self.opt['pretrained_index_network'], map_location=self.device)
+ self.index_pred_guidance_encoder.load_state_dict(
+ checkpoint['guidance_encoder'], strict=True)
+ self.index_pred_decoder.load_state_dict(
+ @@ -176,7 +176,7 @@ class BaseSampleModel():
+ self.index_pred_decoder.eval()
+
+ def load_sampler_pretrained_network(self):
+ - checkpoint = torch.load(self.opt['pretrained_sampler'])
+ + checkpoint = torch.load(self.opt['pretrained_sampler'], map_location=self.device)
+ self.sampler_fn.load_state_dict(checkpoint, strict=True)
+ self.sampler_fn.eval()
+
+ @@ -397,7 +397,7 @@ class SampleFromPoseModel(BaseSampleModel):
+ [185, 210, 205], [130, 165, 180], [225, 141, 151]]
+
+ def load_shape_generation_models(self):
+ - checkpoint = torch.load(self.opt['pretrained_parsing_gen'])
+ + checkpoint = torch.load(self.opt['pretrained_parsing_gen'], map_location=self.device)
+
+ self.shape_attr_embedder.load_state_dict(
+ checkpoint['embedder'], strict=True)
+ diff --git a/models/transformer_model.py b/models/transformer_model.py
+ index 7db0f3e..4523d17 100644
+ --- a/models/transformer_model.py
+ +++ b/models/transformer_model.py
+ @@ -21,7 +21,7 @@ class TransformerTextureAwareModel():
+
+ def __init__(self, opt):
+ self.opt = opt
+ - self.device = torch.device('cuda')
+ + self.device = torch.device(opt['device'])
+ self.is_train = opt['is_train']
+
+ # VQVAE for image
+ @@ -317,10 +317,10 @@ class TransformerTextureAwareModel():
+ def sample_fn(self, temp=1.0, sample_steps=None):
+ self._denoise_fn.eval()
+
+ - b, device = self.image.size(0), 'cuda'
+ + b = self.image.size(0)
+ x_t = torch.ones(
+ - (b, np.prod(self.shape)), device=device).long() * self.mask_id
+ - unmasked = torch.zeros_like(x_t, device=device).bool()
+ + (b, np.prod(self.shape)), device=self.device).long() * self.mask_id
+ + unmasked = torch.zeros_like(x_t, device=self.device).bool()
+ sample_steps = list(range(1, sample_steps + 1))
+
+ texture_mask_flatten = self.texture_tokens.view(-1)
+ @@ -336,11 +336,11 @@ class TransformerTextureAwareModel():
+
+ for t in reversed(sample_steps):
+ print(f'Sample timestep {t:4d}', end='\r')
+ - t = torch.full((b, ), t, device=device, dtype=torch.long)
+ + t = torch.full((b, ), t, device=self.device, dtype=torch.long)
+
+ # where to unmask
+ changes = torch.rand(
+ - x_t.shape, device=device) < 1 / t.float().unsqueeze(-1)
+ + x_t.shape, device=self.device) < 1 / t.float().unsqueeze(-1)
+ # don't unmask somewhere already unmasked
+ changes = torch.bitwise_xor(changes,
+ torch.bitwise_and(changes, unmasked))
+ diff --git a/models/vqgan_model.py b/models/vqgan_model.py
+ index 13a2e70..9c840f1 100644
+ --- a/models/vqgan_model.py
+ +++ b/models/vqgan_model.py
+ @@ -20,7 +20,7 @@ class VQModel():
+ def __init__(self, opt):
+ super().__init__()
+ self.opt = opt
+ - self.device = torch.device('cuda')
+ + self.device = torch.device(opt['device'])
+ self.encoder = Encoder(
+ ch=opt['ch'],
+ num_res_blocks=opt['num_res_blocks'],
+ @@ -390,7 +390,7 @@ class VQImageSegmTextureModel(VQImageModel):
+
+ def __init__(self, opt):
+ self.opt = opt
+ - self.device = torch.device('cuda')
+ + self.device = torch.device(opt['device'])
+ self.encoder = Encoder(
+ ch=opt['ch'],
+ num_res_blocks=opt['num_res_blocks'],
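This patch is what app.py applies to the submodule at startup (git apply ../patch with cwd='Text2Human'): it replaces the hard-coded torch.device('cuda') with the device passed in through the config and adds map_location=self.device to each torch.load call, so the pretrained checkpoints can be loaded and sampled on CPU.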
pose_images/000.png ADDED

Git LFS Details

  • SHA256: e109163ba1ebfe4c3323ac700e1e6dd9443d5d3cf7e468a3587de7fc40383fa8
  • Pointer size: 131 Bytes
  • Size of remote file: 116 kB
pose_images/001.png ADDED

Git LFS Details

  • SHA256: 4656ad02618a7760a7214a1d494b73439f1a651df1ee9e0052b2417804614a56
  • Pointer size: 131 Bytes
  • Size of remote file: 123 kB
pose_images/002.png ADDED

Git LFS Details

  • SHA256: 9e493d8e9d17f601b47cf7124a91916c9370b5a6dc9b081749ca3116743e8b3f
  • Pointer size: 131 Bytes
  • Size of remote file: 120 kB
pose_images/003.png ADDED

Git LFS Details

  • SHA256: bbdc5ba3553ed8d512061143db73beaf2adf13c55bc0fba291b5657e63ffbeb8
  • Pointer size: 130 Bytes
  • Size of remote file: 99 kB
pose_images/004.png ADDED

Git LFS Details

  • SHA256: 489a4c28711760b5c68f15b5bc94761c47c6f8fbb0fb307473736e6a08cf3991
  • Pointer size: 131 Bytes
  • Size of remote file: 149 kB
pose_images/005.png ADDED

Git LFS Details

  • SHA256: 03f08c831206f68beaa548c75272e932433f4ed65837698696805c92048e334c
  • Pointer size: 131 Bytes
  • Size of remote file: 153 kB
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ einops==0.4.1
+ lpips==0.1.4
+ mmcv-full==1.5.2
+ mmsegmentation==0.24.1
+ numpy==1.22.3
+ Pillow==9.1.1
+ sentence-transformers==2.2.0
+ tokenizers==0.12.1
+ torch==1.11.0
+ torchvision==0.12.0
+ transformers==4.19.2
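These pins install locally with pip install -r requirements.txt; on Spaces, app.py additionally uninstalls and reinstalls mmcv-full==1.5.2 at startup (see the subprocess calls at the top of app.py).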