Commit fa0f216 · 1 Parent(s): 434bf7c
vittoriopippi committed

Initial commit

This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
.gitignore ADDED
@@ -0,0 +1,3 @@
+ taylor_swift.png
+ test.py
+ *.pyc
Groundtruth/gan.iam.test.gt.filter27 ADDED
The diff for this file is too large to render. See raw diff
 
Groundtruth/gan.iam.tr_va.gt.filter27 ADDED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -1,3 +1,99 @@
- ---
- license: mit
- ---
+ # Handwritten Text Generation from Visual Archetypes ++
+
+ This repository contains the code for training the VATr++ Styled Handwritten Text Generation model.
+
+ ## Installation
+
+ ```bash
+ conda create --name vatr python=3.9
+ conda activate vatr
+ conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.7 -c pytorch -c nvidia
+ git clone https://github.com/aimagelab/VATr.git && cd VATr
+ pip install -r requirements.txt
+ ```
+
+ [This folder](https://drive.google.com/drive/folders/13rJhjl7VsyiXlPTBvnp1EKkKEhckLalr?usp=sharing) contains the regular IAM dataset `IAM-32.pickle` and the modified version with attached punctuation marks, `IAM-32-pa.pickle`.
+ The folder also contains the synthetically pretrained weights for the encoder, `resnet_18_pretrained.pth`.
+ Please download these files and place them in the `files` folder.
+
+ ## Training
+
+ To train the regular VATr model, use the following command. It uses the default settings from the paper.
+
+ ```bash
+ python train.py
+ ```
+
+ Useful arguments:
+ ```bash
+ python train.py
+         --feat_model_path PATH   # path to the pretrained ResNet-18 checkpoint; defaults to the synthetically pretrained model
+         --is_cycle               # use the style cycle loss for training
+         --dataset DATASET        # dataset to use (default: IAM)
+         --resume                 # resume training from the last checkpoint with the same name
+         --wandb                  # use wandb for logging
+ ```
+
+ Use the following arguments to apply the full VATr++ training setup:
+ ```bash
+ python train.py
+         --d-crop-size 64 128          # randomly crop the discriminator input to a width between 64 and 128
+         --text-augment-strength 0.4   # text augmentation for adding more rare characters
+         --file-suffix pa              # use the punctuation-attached version of IAM
+         --augment-ocr                 # augment the real images used to train the OCR model
+ ```
+
+ ### Pretraining dataset
+ The model `resnet_18_pretrained.pth` was pretrained on the [Font Square](https://github.com/aimagelab/font_square) dataset.
+
+ ## Generate Styled Handwritten Text Images
+
+ We provide utilities to generate handwritten text images with the trained model. They are used as follows:
+
+ ```bash
+ python generate.py [ACTION] --checkpoint files/vatrpp.pth
+ ```
+
+ The following actions are available, with their respective arguments.
+
+ ### Custom Author
+
+ Generate the given text for a custom author.
+
+ ```bash
+ text    --text STRING         # string to generate
+         --text-path PATH      # optional path to a text file
+         --output PATH         # optional output location, default: files/output.png
+         --style-folder PATH   # optional style folder containing writer samples, default: files/style_samples/00
+ ```
+ Style samples for the author are needed. These can be generated automatically from an image of a page using `create_style_sample.py`.
+ ```bash
+ python create_style_sample.py   --input-image PATH      # path of the image to extract the style samples from
+                                 --output-folder PATH    # folder where the style samples should be saved
+ ```
+
+ ### All Authors
+
+ Generate some text for all authors of IAM. The output is saved to `saved_images/author_samples/`.
+
+ ```bash
+ authors --test-set        # generate for the authors of the test set; otherwise the training set is used
+         --checkpoint PATH # checkpoint used to generate text, files/vatr.pth by default
+         --align           # detect the bottom line of each word and align the words
+         --at-once         # generate the whole sentence at once instead of word by word
+         --output-style    # also save the style images used to generate the words
+ ```
+
+ ### Evaluation Images
+
+ ```bash
+ fid     --target_dataset_path PATH   # dataset file for which the test set will be generated
+         --dataset-path PATH          # dataset file from which style samples are taken, for example the punctuation-attached version
+         --output PATH                # where to save the images, default: saved_images/fid
+         --checkpoint PATH            # checkpoint used to generate text, files/vatr.pth by default
+         --all-epochs                 # generate evaluation images for all saved epochs (the checkpoint has to be a folder)
+         --fake-only                  # only output fake images, no ground truth
+         --test-only                  # only generate the test set, not the train set
+         --long-tail                  # only generate words containing long-tail characters
+ ```
config.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "add_noise": false,
+   "alphabet": "Only thewigsofrcvdampbkuq.A-210xT5'MDL,RYHJ\"ISPWENj&BC93VGFKz();#:!7U64Q8?+*ZX/%",
+   "architectures": [
+     "VATrPP"
+   ],
+   "augment_ocr": false,
+   "batch_size": 8,
+   "corpus": "standard",
+   "d_crop_size": null,
+   "d_lr": 1e-05,
+   "dataset": "IAM",
+   "device": "cuda",
+   "english_words_path": "files/english_words.txt",
+   "epochs": 100000,
+   "feat_model_path": "files/resnet_18_pretrained.pth",
+   "file_suffix": null,
+   "g_lr": 5e-05,
+   "img_height": 32,
+   "is_cycle": false,
+   "label_encoder": "default",
+   "model_type": "emuru",
+   "no_ocr_loss": false,
+   "no_writer_loss": false,
+   "num_examples": 15,
+   "num_words": 3,
+   "num_workers": 0,
+   "num_writers": 339,
+   "ocr_lr": 5e-05,
+   "query_input": "unifont",
+   "resolution": 16,
+   "save_model": 5,
+   "save_model_history": 500,
+   "save_model_path": "saved_models",
+   "seed": 742,
+   "special_alphabet": "\u0391\u03b1\u0392\u03b2\u0393\u03b3\u0394\u03b4\u0395\u03b5\u0396\u03b6\u0397\u03b7\u0398\u03b8\u0399\u03b9\u039a\u03ba\u039b\u03bb\u039c\u03bc\u039d\u03bd\u039e\u03be\u039f\u03bf\u03a0\u03c0\u03a1\u03c1\u03a3\u03c3\u03c2\u03a4\u03c4\u03a5\u03c5\u03a6\u03c6\u03a7\u03c7\u03a8\u03c8\u03a9\u03c9",
+   "tag": "debug",
+   "text_aug_type": "proportional",
+   "text_augment_strength": 0.0,
+   "torch_dtype": "float32",
+   "transformers_version": "4.46.2",
+   "vocab_size": 80,
+   "w_lr": 5e-05,
+   "wandb": false,
+   "writer_loss_weight": 1.0
+ }
configuration_vatrpp.py ADDED
@@ -0,0 +1,82 @@
+ from transformers import PretrainedConfig
+
+ class VATrPPConfig(PretrainedConfig):
+     model_type = "emuru"
+
+     def __init__(self,
+                  feat_model_path='files/resnet_18_pretrained.pth',
+                  label_encoder='default',
+                  save_model_path='saved_models',
+                  dataset='IAM',
+                  english_words_path='files/english_words.txt',
+                  wandb=False,
+                  no_writer_loss=False,
+                  writer_loss_weight=1.0,
+                  no_ocr_loss=False,
+                  img_height=32,
+                  resolution=16,
+                  batch_size=8,
+                  num_examples=15,
+                  num_writers=339,
+                  alphabet='Only thewigsofrcvdampbkuq.A-210xT5\'MDL,RYHJ"ISPWENj&BC93VGFKz();#:!7U64Q8?+*ZX/%',
+                  special_alphabet='ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω',
+                  g_lr=0.00005,
+                  d_lr=0.00001,
+                  w_lr=0.00005,
+                  ocr_lr=0.00005,
+                  epochs=100000,
+                  num_workers=0,
+                  seed=742,
+                  num_words=3,
+                  is_cycle=False,
+                  add_noise=False,
+                  save_model=5,
+                  save_model_history=500,
+                  tag='debug',
+                  device='cuda',
+                  query_input='unifont',
+                  corpus="standard",
+                  text_augment_strength=0.0,
+                  text_aug_type="proportional",
+                  file_suffix=None,
+                  augment_ocr=False,
+                  d_crop_size=None,
+                  **kwargs):
+         super().__init__(**kwargs)
+         self.feat_model_path = feat_model_path
+         self.label_encoder = label_encoder
+         self.save_model_path = save_model_path
+         self.dataset = dataset
+         self.english_words_path = english_words_path
+         self.wandb = wandb
+         self.no_writer_loss = no_writer_loss
+         self.writer_loss_weight = writer_loss_weight
+         self.no_ocr_loss = no_ocr_loss
+         self.img_height = img_height
+         self.resolution = resolution
+         self.batch_size = batch_size
+         self.num_examples = num_examples
+         self.num_writers = num_writers
+         self.alphabet = alphabet
+         self.special_alphabet = special_alphabet
+         self.g_lr = g_lr
+         self.d_lr = d_lr
+         self.w_lr = w_lr
+         self.ocr_lr = ocr_lr
+         self.epochs = epochs
+         self.num_workers = num_workers
+         self.seed = seed
+         self.num_words = num_words
+         self.is_cycle = is_cycle
+         self.add_noise = add_noise
+         self.save_model = save_model
+         self.save_model_history = save_model_history
+         self.tag = tag
+         self.device = device
+         self.query_input = query_input
+         self.corpus = corpus
+         self.text_augment_strength = text_augment_strength
+         self.text_aug_type = text_aug_type
+         self.file_suffix = file_suffix
+         self.augment_ocr = augment_ocr
+         self.d_crop_size = d_crop_size
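Editorial note (not part of the committed file): since `VATrPPConfig` subclasses `PretrainedConfig`, it can be serialized and reloaded with the standard `transformers` config API. The sketch below assumes only that interface and is consistent with the `config.json` added in this commit; the output directory name is illustrative.

```python
# Minimal sketch: round-trip the model configuration with the PretrainedConfig API.
from configuration_vatrpp import VATrPPConfig

config = VATrPPConfig(batch_size=16, is_cycle=True)    # override a couple of defaults
config.save_pretrained("vatrpp-config")                # writes config.json to the directory
reloaded = VATrPPConfig.from_pretrained("vatrpp-config")
assert reloaded.batch_size == 16 and reloaded.model_type == "emuru"
```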
corpora_english/brown-azAZ.tr ADDED
The diff for this file is too large to render. See raw diff
 
corpora_english/in_vocab.subset.tro.37 ADDED
@@ -0,0 +1,114 @@
1
+ accents
2
+ fifty
3
+ gross
4
+ Tea
5
+ whom
6
+ renamed
7
+ Heaven
8
+ Harry
9
+ arrange
10
+ captain
11
+ why
12
+ Father
13
+ beaten
14
+ Bar
15
+ base
16
+ creamy
17
+ About
18
+ Allies
19
+ sound
20
+ farmers
21
+ anyone
22
+ steel
23
+ Mary
24
+ used
25
+ fever
26
+ looking
27
+ lately
28
+ returns
29
+ humans
30
+ finals
31
+ beyond
32
+ lots
33
+ waiting
34
+ cited
35
+ measure
36
+ posse
37
+ blow
38
+ blonde
39
+ twice
40
+ Having
41
+ compels
42
+ rooms
43
+ cocked
44
+ virtual
45
+ dying
46
+ tons
47
+ Travel
48
+ idea
49
+ gripped
50
+ Act
51
+ reign
52
+ moods
53
+ altered
54
+ sample
55
+ Soviet
56
+ thick
57
+ enigma
58
+ here
59
+ egghead
60
+ Public
61
+ Bryan
62
+ porous
63
+ estate
64
+ guilty
65
+ Caught
66
+ Lucas
67
+ observe
68
+ mouth
69
+ pricked
70
+ obscure
71
+ casual
72
+ take
73
+ home
74
+ amber
75
+ weekend
76
+ forming
77
+ aid
78
+ outlook
79
+ uniting
80
+ But
81
+ earnest
82
+ bear
83
+ news
84
+ sparked
85
+ merrily
86
+ extreme
87
+ North
88
+ damned
89
+ big
90
+ bosses
91
+ context
92
+ easily
93
+ took
94
+ hurried
95
+ Gene
96
+ due
97
+ deserve
98
+ cult
99
+ leisure
100
+ critics
101
+ parish
102
+ Music
103
+ charge
104
+ grey
105
+ Privy
106
+ Fred
107
+ massive
108
+ others
109
+ shirt
110
+ average
111
+ warning
112
+ Tuesday
113
+ locked
114
+ possess
corpora_english/oov.common_words ADDED
@@ -0,0 +1,79 @@
1
+ planets
2
+ lips
3
+ varies
4
+ impact
5
+ skips
6
+ Gold
7
+ maple
8
+ voyager
9
+ noisy
10
+ stick
11
+ forums
12
+ drafts
13
+ crimson
14
+ sever
15
+ rackets
16
+ sexy
17
+ humming
18
+ cheated
19
+ lick
20
+ grades
21
+ heroic
22
+ Clever
23
+ foul
24
+ mood
25
+ warrior
26
+ Morning
27
+ poetic
28
+ nodding
29
+ certify
30
+ reviews
31
+ mosaics
32
+ senders
33
+ Isle
34
+ Lied
35
+ sand
36
+ Weight
37
+ writer
38
+ trusts
39
+ slot
40
+ eaten
41
+ squares
42
+ lists
43
+ vary
44
+ witches
45
+ compose
46
+ demons
47
+ therapy
48
+ focus
49
+ sticks
50
+ Whose
51
+ bumped
52
+ visibly
53
+ redeem
54
+ arsenal
55
+ lunatic
56
+ Similar
57
+ Bug
58
+ adheres
59
+ trail
60
+ robbing
61
+ Whisky
62
+ super
63
+ screwed
64
+ Flower
65
+ salads
66
+ Glow
67
+ Vapor
68
+ Married
69
+ recieve
70
+ handle
71
+ push
72
+ card
73
+ skiing
74
+ lotus
75
+ cloud
76
+ windy
77
+ monkey
78
+ virus
79
+ thunder
corpora_english/oov_words.txt ADDED
@@ -0,0 +1,400 @@
1
+ planets
2
+ lips
3
+ varies
4
+ impact
5
+ skips
6
+ Gold
7
+ maple
8
+ voyager
9
+ noisy
10
+ stick
11
+ forums
12
+ drafts
13
+ crimson
14
+ sever
15
+ rackets
16
+ sexy
17
+ humming
18
+ cheated
19
+ lick
20
+ grades
21
+ heroic
22
+ Clever
23
+ foul
24
+ mood
25
+ warrior
26
+ Morning
27
+ poetic
28
+ nodding
29
+ certify
30
+ reviews
31
+ mosaics
32
+ senders
33
+ Isle
34
+ Lied
35
+ sand
36
+ Weight
37
+ writer
38
+ trusts
39
+ slot
40
+ eaten
41
+ squares
42
+ lists
43
+ vary
44
+ witches
45
+ compose
46
+ demons
47
+ therapy
48
+ focus
49
+ sticks
50
+ Whose
51
+ bumped
52
+ visibly
53
+ redeem
54
+ arsenal
55
+ lunatic
56
+ Similar
57
+ Bug
58
+ adheres
59
+ trail
60
+ robbing
61
+ Whisky
62
+ super
63
+ screwed
64
+ Flower
65
+ salads
66
+ Glow
67
+ Vapor
68
+ Married
69
+ recieve
70
+ handle
71
+ push
72
+ card
73
+ skiing
74
+ lotus
75
+ cloud
76
+ windy
77
+ monkey
78
+ virus
79
+ thunder
80
+ Keegan
81
+ purling
82
+ Orpheus
83
+ Prence
84
+ Yin
85
+ Kansas
86
+ jowls
87
+ Alabama
88
+ Szold
89
+ Chou
90
+ Orange
91
+ suspend
92
+ barred
93
+ deceit
94
+ reward
95
+ soy
96
+ Vail
97
+ lad
98
+ Loesser
99
+ Hutton
100
+ jerks
101
+ yelling
102
+ Heywood
103
+ sacker
104
+ comest
105
+ tense
106
+ par
107
+ fiend
108
+ Soiree
109
+ voted
110
+ Putting
111
+ pansy
112
+ doormen
113
+ mayor
114
+ Owens
115
+ noting
116
+ pauses
117
+ USP
118
+ crudely
119
+ grooved
120
+ furor
121
+ ignited
122
+ kittens
123
+ broader
124
+ slang
125
+ ballets
126
+ quacked
127
+ Paulus
128
+ Castles
129
+ upswing
130
+ dabbled
131
+ Animals
132
+ Kidder
133
+ Writers
134
+ laces
135
+ bled
136
+ scoped
137
+ yield
138
+ scoured
139
+ Schenk
140
+ Wratten
141
+ Menfolk
142
+ foamy
143
+ scratch
144
+ minced
145
+ nudged
146
+ Seats
147
+ Judging
148
+ Turbine
149
+ Strict
150
+ whined
151
+ crupper
152
+ Dussa
153
+ finned
154
+ voter
155
+ Jacobs
156
+ calmly
157
+ hip
158
+ clubs
159
+ quintet
160
+ blunts
161
+ Grazie
162
+ Barton
163
+ NAB
164
+ specie
165
+ Fonta
166
+ narrow
167
+ Swan
168
+ denials
169
+ Rawson
170
+ potato
171
+ Choral
172
+ diverse
173
+ Educate
174
+ unities
175
+ Ferry
176
+ Bonner
177
+ manuals
178
+ NAIR
179
+ imputed
180
+ initial
181
+ wallet
182
+ Sesame
183
+ maroon
184
+ Related
185
+ Quiney
186
+ Monster
187
+ brainy
188
+ Nolan
189
+ Thrifty
190
+ Tel
191
+ Ye
192
+ Sumter
193
+ Bonnet
194
+ sheepe
195
+ nagged
196
+ ribbing
197
+ hunt
198
+ AA
199
+ Pohly
200
+ triol
201
+ saws
202
+ popped
203
+ aloof
204
+ Ceramic
205
+ thong
206
+ typed
207
+ broadly
208
+ Figures
209
+ riddle
210
+ Otis
211
+ Sainted
212
+ upbeat
213
+ Getting
214
+ hisself
215
+ junta
216
+ Labans
217
+ starter
218
+ coward
219
+ Anthea
220
+ hurlers
221
+ Dervish
222
+ Turin
223
+ oud
224
+ tyranny
225
+ Rotary
226
+ Veneto
227
+ pulls
228
+ bowl
229
+ utopias
230
+ auburn
231
+ osmotic
232
+ myrtle
233
+ furrow
234
+ laws
235
+ Uh
236
+ Hodges
237
+ Wilde
238
+ Neck
239
+ snaked
240
+ decorum
241
+ edema
242
+ Dunston
243
+ clinics
244
+ Abide
245
+ Dover
246
+ voltaic
247
+ Modern
248
+ Farr
249
+ thaw
250
+ moi
251
+ leaning
252
+ wedlock
253
+ Carson
254
+ star
255
+ Hymn
256
+ Stack
257
+ genes
258
+ Shayne
259
+ Moune
260
+ slipped
261
+ legatee
262
+ coerced
263
+ Gates
264
+ pulse
265
+ Granny
266
+ bat
267
+ Fruit
268
+ Cadesi
269
+ Tee
270
+ Dreiser
271
+ Getz
272
+ Ways
273
+ cogs
274
+ hydrous
275
+ sweep
276
+ quarrel
277
+ mobcaps
278
+ slash
279
+ throats
280
+ Royaux
281
+ cafes
282
+ crusher
283
+ rusted
284
+ Eskimo
285
+ slatted
286
+ pallet
287
+ yelps
288
+ slanted
289
+ confide
290
+ Gomez
291
+ untidy
292
+ Sigmund
293
+ Marine
294
+ roll
295
+ NRL
296
+ Dukes
297
+ tumours
298
+ LP
299
+ turtles
300
+ audible
301
+ Woodrow
302
+ retreat
303
+ Orders
304
+ Conlow
305
+ hobby
306
+ skin
307
+ tally
308
+ frosted
309
+ drowned
310
+ wedged
311
+ queen
312
+ poised
313
+ eluded
314
+ Letter
315
+ ticking
316
+ kill
317
+ rancor
318
+ Plant
319
+ Brandel
320
+ Willows
321
+ riddles
322
+ carven
323
+ Spiller
324
+ yen
325
+ jerky
326
+ tenure
327
+ daubed
328
+ Serves
329
+ pimpled
330
+ ACTH
331
+ ruh
332
+ afield
333
+ suffuse
334
+ muffins
335
+ Miners
336
+ Cabrini
337
+ weakly
338
+ upriver
339
+ Newsom
340
+ Meeker
341
+ weed
342
+ fiscal
343
+ Diane
344
+ Errors
345
+ Mig
346
+ biz
347
+ Drink
348
+ chop
349
+ Bumbry
350
+ Babin
351
+ optimum
352
+ Leyden
353
+ enrage
354
+ induces
355
+ newel
356
+ trim
357
+ bolts
358
+ frog
359
+ cinder
360
+ Lo
361
+ clobber
362
+ Mennen
363
+ Othon
364
+ Ocean
365
+ jerking
366
+ engine
367
+ Belasco
368
+ hero
369
+ flora
370
+ Injuns
371
+ Rico
372
+ Gary
373
+ snake
374
+ hating
375
+ Suggs
376
+ booze
377
+ Lescaut
378
+ Molard
379
+ startle
380
+ Aggie
381
+ lengthy
382
+ Shoals
383
+ ideals
384
+ Zen
385
+ stem
386
+ noon
387
+ hoes
388
+ Seafood
389
+ yuh
390
+ Mostly
391
+ seeds
392
+ bestow
393
+ acetate
394
+ jokers
395
+ waning
396
+ volumes
397
+ ein
398
+ Rich
399
+ Galt
400
+ pasted
create_style_sample.py ADDED
@@ -0,0 +1,25 @@
+ import os
+ import argparse
+
+ import cv2
+ from util.vision import get_page, get_words
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument("--input-image", type=str, required=True)
+     parser.add_argument("--output-folder", type=str, required=True, default='files/style_samples/00')
+
+     args = parser.parse_args()
+
+     image = cv2.imread(args.input_image)
+     image = cv2.resize(image, (image.shape[1], image.shape[0]))
+     result = get_page(image)
+     words, _ = get_words(result)
+
+     output_path = args.output_folder
+     if not os.path.exists(output_path):
+         os.mkdir(output_path)
+     for i, word in enumerate(words):
+         cv2.imwrite(os.path.join(output_path, f"word{i}.png"), word)
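Editorial note (not part of the committed file): the word crops written by `create_style_sample.py` are meant to be used as writer style samples. A minimal sketch of consuming them via `FolderDataset` (defined in `data/dataset.py` later in this commit), assuming the folder contains at least `num_examples` crops; the folder path is illustrative.

```python
# Minimal sketch: load extracted word crops as a style sample batch.
from data.dataset import FolderDataset

style_ds = FolderDataset("files/style_samples/00", num_examples=15)
style = style_ds.sample_style()
print(style["simg"].shape, style["swids"])   # 15 padded style images and their widths
```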
data/create_data.py ADDED
@@ -0,0 +1,469 @@
1
+ import gzip
2
+ import json
3
+ import os
4
+ import pickle
5
+ import random
6
+ from collections import defaultdict
7
+
8
+ import PIL
9
+ import cv2
10
+ import numpy as np
11
+ from PIL import Image
12
+
13
+
14
+ TO_MERGE = {
15
+ '.': 'left',
16
+ ',': 'left',
17
+ '!': 'left',
18
+ '?': 'left',
19
+ '(': 'right',
20
+ ')': 'left',
21
+ '\"': 'random',
22
+ "\'": 'random',
23
+ ":": 'left',
24
+ ";": 'left',
25
+ "-": 'random'
26
+ }
27
+
28
+ FILTER_ERR = False
29
+
30
+
31
+ def resize(image, size):
32
+ image_pil = Image.fromarray(image.astype('uint8'), 'L')
33
+ image_pil = image_pil.resize(size)
34
+ return np.array(image_pil)
35
+
36
+
37
+ def get_author_ids(base_folder: str):
38
+ with open(os.path.join(base_folder, "gan.iam.tr_va.gt.filter27"), 'r') as f:
39
+ training_authors = [line.split(",")[0] for line in f]
40
+ training_authors = set(training_authors)
41
+
42
+ with open(os.path.join(base_folder, "gan.iam.test.gt.filter27"), 'r') as f:
43
+ test_authors = [line.split(",")[0] for line in f]
44
+ test_authors = set(test_authors)
45
+
46
+ assert len(training_authors.intersection(test_authors)) == 0
47
+
48
+ return training_authors, test_authors
49
+
50
+
51
+ class IAMImage:
52
+ def __init__(self, image: np.array, label: str, image_id: int, line_id: str, bbox: list = None, iam_image_id: str = None):
53
+ self.image = image
54
+ self.label = label
55
+ self.image_id = image_id
56
+ self.line_id = line_id
57
+ self.iam_image_id = iam_image_id
58
+ self.has_bbox = False
59
+ if bbox is not None:
60
+ self.has_bbox = True
61
+ self.x, self.y, self.w, self.h = bbox
62
+
63
+ def merge(self, other: 'IAMImage'):
64
+ global MERGER_COUNT
65
+ assert self.has_bbox, "IAM image has no bounding box information"
66
+ y = min(self.y, other.y)
67
+ h = max(other.y + other.h, self.y + self.h) - y
68
+
69
+ x = min(self.x, other.x)
70
+ w = max(self.x + self.w, other.x + other.w) - x
71
+
72
+ new_image = np.ones((h, w), dtype=self.image.dtype) * 255
73
+
74
+ anchor_x = self.x - x
75
+ anchor_y = self.y - y
76
+ new_image[anchor_y:anchor_y + self.h, anchor_x:anchor_x + self.w] = self.image
77
+
78
+ anchor_x = other.x - x
79
+ anchor_y = other.y - y
80
+ new_image[anchor_y:anchor_y + other.h, anchor_x:anchor_x + other.w] = other.image
81
+
82
+ if other.x - (self.x + self.w) > 50:
83
+ new_label = self.label + " " + other.label
84
+ else:
85
+ new_label = self.label + other.label
86
+ new_id = self.image_id
87
+ new_bbox = [x, y, w, h]
88
+
89
+ new_iam_image_id = self.iam_image_id if len(self.label) > len(other.label) else other.iam_image_id
90
+ return IAMImage(new_image, new_label, new_id, self.line_id, new_bbox, iam_image_id=new_iam_image_id)
91
+
92
+
93
+ def read_iam_lines(base_folder: str) -> dict:
94
+ form_to_author = {}
95
+ with open(os.path.join(base_folder, "forms.txt"), 'r') as f:
96
+ for line in f:
97
+ if not line.startswith("#"):
98
+ form, author, *_ = line.split(" ")
99
+ form_to_author[form] = author
100
+
101
+ training_authors, test_authors = get_author_ids(base_folder)
102
+
103
+ dataset_dict = {
104
+ 'train': defaultdict(list),
105
+ 'test': defaultdict(list),
106
+ 'other': defaultdict(list)
107
+ }
108
+
109
+ image_count = 0
110
+
111
+ with open(os.path.join(base_folder, "sentences.txt"), 'r') as f:
112
+ for line in f:
113
+ if not line.startswith("#"):
114
+ line_id, _, ok, *_, label = line.rstrip().split(" ")
115
+ form_id = "-".join(line_id.split("-")[:2])
116
+ author_id = form_to_author[form_id]
117
+
118
+ if ok != 'ok' and FILTER_ERR:
119
+ continue
120
+
121
+ line_label = ""
122
+ for word in label.split("|"):
123
+ if not(len(line_label) == 0 or word in [".", ","]):
124
+ line_label += " "
125
+ line_label += word
126
+
127
+ image_path = os.path.join(base_folder, "sentences", form_id.split("-")[0], form_id, f"{line_id}.png")
128
+
129
+ subset = 'other'
130
+ if author_id in training_authors:
131
+ subset = 'train'
132
+ elif author_id in test_authors:
133
+ subset = 'test'
134
+
135
+ im = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
136
+ if im is not None and im.size > 1:
137
+ dataset_dict[subset][author_id].append(IAMImage(
138
+ im, line_label, image_count, line_id, None
139
+ ))
140
+ image_count += 1
141
+
142
+ return dataset_dict
143
+
144
+
145
+ def read_iam(base_folder: str) -> dict:
146
+ with open(os.path.join(base_folder, "forms.txt"), 'r') as f:
147
+ forms = [line.rstrip() for line in f if not line.startswith("#")]
148
+
149
+ training_authors, test_authors = get_author_ids(base_folder)
150
+
151
+ image_info = {}
152
+ with open(os.path.join(base_folder, "words.txt"), 'r') as f:
153
+ for line in f:
154
+ if not line.startswith("#"):
155
+ image_id, ok, threshold, x, y, w, h, tag, *content = line.rstrip().split(" ")
156
+ image_info[image_id] = {
157
+ 'ok': ok == 'ok',
158
+ 'threshold': threshold,
159
+ 'content': " ".join(content) if isinstance(content, list) else content,
160
+ 'bbox': [int(x), int(y), int(w), int(h)]
161
+ }
162
+
163
+ dataset_dict = {
164
+ 'train': defaultdict(list),
165
+ 'test': defaultdict(list),
166
+ 'other': defaultdict(list)
167
+ }
168
+
169
+ image_count = 0
170
+ err_count = 0
171
+
172
+ for form in forms:
173
+ form_id, writer_id, *_ = form.split(" ")
174
+ base_form = form_id.split("-")[0]
175
+
176
+ form_path = os.path.join(base_folder, "words", base_form, form_id)
177
+
178
+ for image_name in os.listdir(form_path):
179
+ image_id = image_name.split(".")[0]
180
+ info = image_info[image_id]
181
+
182
+ subset = 'other'
183
+ if writer_id in training_authors:
184
+ subset = 'train'
185
+ elif writer_id in test_authors:
186
+ subset = 'test'
187
+
188
+ if info['ok'] or not FILTER_ERR:
189
+ im = cv2.imread(os.path.join(form_path, image_name), cv2.IMREAD_GRAYSCALE)
190
+ if not info['ok'] and False:
191
+ cv2.destroyAllWindows()
192
+ print(info['content'])
193
+ cv2.imshow("image", im)
194
+ cv2.waitKey(0)
195
+
196
+ if im is not None and im.size > 1:
197
+ dataset_dict[subset][writer_id].append(IAMImage(
198
+ im, info['content'], image_count, "-".join(image_id.split("-")[:3]), info['bbox'], iam_image_id=image_id
199
+ ))
200
+ image_count += 1
201
+ else:
202
+ err_count += 1
203
+ print(f"Could not read image {image_name}, skipping")
204
+ else:
205
+ err_count += 1
206
+
207
+ assert not dataset_dict['train'].keys() & dataset_dict['test'].keys(), "Training and Testing set have common authors"
208
+
209
+ print(f"Skipped images: {err_count}")
210
+
211
+ return dataset_dict
212
+
213
+
214
+ def read_cvl_set(set_folder: str):
215
+ set_images = defaultdict(list)
216
+ words_path = os.path.join(set_folder, "words")
217
+
218
+ image_id = 0
219
+
220
+ for author_id in os.listdir(words_path):
221
+ author_path = os.path.join(words_path, author_id)
222
+
223
+ for image_file in os.listdir(author_path):
224
+ label = image_file.split("-")[-1].split(".")[0]
225
+ line_id = "-".join(image_file.split("-")[:-2])
226
+
227
+ stream = open(os.path.join(author_path, image_file), "rb")
228
+ bytes = bytearray(stream.read())
229
+ numpyarray = np.asarray(bytes, dtype=np.uint8)
230
+ image = cv2.imdecode(numpyarray, cv2.IMREAD_UNCHANGED)
231
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
232
+ if image is not None and image.size > 1:
233
+ set_images[int(author_id)].append(IAMImage(image, label, image_id, line_id))
234
+ image_id += 1
235
+
236
+ return set_images
237
+
238
+
239
+ def read_cvl(base_folder: str):
240
+ dataset_dict = {
241
+ 'test': read_cvl_set(os.path.join(base_folder, 'testset')),
242
+ 'train': read_cvl_set(os.path.join(base_folder, 'trainset'))
243
+ }
244
+
245
+ assert not dataset_dict['train'].keys() & dataset_dict[
246
+ 'test'].keys(), "Training and Testing set have common authors"
247
+
248
+ return dataset_dict
249
+
250
+ def pad_top(image: np.array, height: int) -> np.array:
251
+ result = np.ones((height, image.shape[1]), dtype=np.uint8) * 255
252
+ result[height - image.shape[0]:, :image.shape[1]] = image
253
+
254
+ return result
255
+
256
+
257
+ def scale_per_writer(writer_dict: dict, target_height: int, char_width: int = None) -> dict:
258
+ for author_id in writer_dict.keys():
259
+ max_height = max([image_dict.image.shape[0] for image_dict in writer_dict[author_id]])
260
+ scale_y = target_height / max_height
261
+
262
+ for image_dict in writer_dict[author_id]:
263
+ image = image_dict.image
264
+ scale_x = scale_y if char_width is None else len(image_dict.label) * char_width / image_dict.image.shape[1]
265
+ #image = cv2.resize(image, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_CUBIC)
266
+ image = resize(image, (int(image.shape[1] * scale_x), int(image.shape[0] * scale_y)))
267
+ image_dict.image = pad_top(image, target_height)
268
+
269
+ return writer_dict
270
+
271
+
272
+ def scale_images(writer_dict: dict, target_height: int, char_width: int = None) -> dict:
273
+ for author_id in writer_dict.keys():
274
+ for image_dict in writer_dict[author_id]:
275
+ scale_y = target_height / image_dict.image.shape[0]
276
+ scale_x = scale_y if char_width is None else len(image_dict.label) * char_width / image_dict.image.shape[1]
277
+ #image_dict.image = cv2.resize(image_dict.image, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_CUBIC)
278
+ image_dict.image = resize(image_dict.image, (int(image_dict.image.shape[1] * scale_x), target_height))
279
+ return writer_dict
280
+
281
+
282
+ def scale_word_width(writer_dict: dict):
283
+ for author_id in writer_dict.keys():
284
+ for image_dict in writer_dict[author_id]:
285
+ width = len(image_dict.label) * (image_dict.image.shape[0] / 2.0)
286
+ image_dict.image = resize(image_dict.image, (int(width), image_dict.image.shape[0]))
287
+ return writer_dict
288
+
289
+
290
+ def get_sentences(author_dict: dict):
291
+ collected = defaultdict(list)
292
+ for image in author_dict:
293
+ collected[image.line_id].append(image)
294
+
295
+ return [v for k, v in collected.items()]
296
+
297
+
298
+ def merge_author_words(author_words):
299
+ def try_left_merge(index: int):
300
+ if index > 0 and author_words[index - 1].line_id == author_words[index].line_id and not to_remove[index - 1] and not author_words[index - 1].label in TO_MERGE.keys():
301
+ merged = author_words[index - 1].merge(author_words[index])
302
+ author_words[index - 1] = merged
303
+ to_remove[index] = True
304
+ return True
305
+ return False
306
+
307
+ def try_right_merge(index: int):
308
+ if index < len(author_words) - 1 and author_words[index].line_id == author_words[index + 1].line_id and not to_remove[index + 1] and not author_words[index + 1].label in TO_MERGE.keys():
309
+ merged = iam_image.merge(author_words[index + 1])
310
+ author_words[index + 1] = merged
311
+ to_remove[index] = True
312
+ return True
313
+ return False
314
+
315
+ to_remove = [False for _ in range(len(author_words))]
316
+ for i in range(len(author_words)):
317
+ iam_image = author_words[i]
318
+ if iam_image.label in TO_MERGE.keys():
319
+ merge_type = TO_MERGE[iam_image.label] if TO_MERGE[iam_image.label] != 'random' else random.choice(['left', 'right'])
320
+ if merge_type == 'left':
321
+ if not try_left_merge(i):
322
+ if not try_right_merge(i):
323
+ print(f"Could not merge char: {iam_image.label}")
324
+ else:
325
+ if not try_right_merge(i):
326
+ if not try_left_merge(i):
327
+ print(f"Could not merge char: {iam_image.label}")
328
+
329
+ return [image for image, remove in zip(author_words, to_remove) if not remove], sum(to_remove)
330
+
331
+
332
+ def merge_punctuation(writer_dict: dict) -> dict:
333
+ for author_id in writer_dict.keys():
334
+ author_dict = writer_dict[author_id]
335
+
336
+ merged = 1
337
+ while merged > 0:
338
+ author_dict, merged = merge_author_words(author_dict)
339
+
340
+ writer_dict[author_id] = author_dict
341
+
342
+ return writer_dict
343
+
344
+
345
+ def filter_punctuation(writer_dict: dict) -> dict:
346
+ for author_id in writer_dict.keys():
347
+ author_list = [im for im in writer_dict[author_id] if im.label not in TO_MERGE.keys()]
348
+
349
+ writer_dict[author_id] = author_list
350
+
351
+ return writer_dict
352
+
353
+
354
+ def filter_by_width(writer_dict: dict, target_height: int = 32, min_width: int = 16, max_width: int = 17) -> dict:
355
+ def is_valid(iam_image: IAMImage) -> bool:
356
+ target_width = (target_height / iam_image.image.shape[0]) * iam_image.image.shape[1]
357
+ if len(iam_image.label) * min_width / 3 <= target_width <= len(iam_image.label) * max_width * 3:
358
+ return True
359
+ else:
360
+ return False
361
+
362
+ for author_id in writer_dict.keys():
363
+ author_list = [im for im in writer_dict[author_id] if is_valid(im)]
364
+
365
+ writer_dict[author_id] = author_list
366
+
367
+ return writer_dict
368
+
369
+
370
+ def write_data(dataset_dict: dict, location: str, height, punct_mode: str = 'none', author_scale: bool = False, uniform_char_width: bool = False):
371
+ assert punct_mode in ['none', 'filter', 'merge']
372
+ result = {}
373
+ for key in dataset_dict.keys():
374
+ result[key] = {}
375
+
376
+ subset_dict = dataset_dict[key]
377
+
378
+ subset_dict = filter_by_width(subset_dict)
379
+
380
+ if punct_mode == 'merge':
381
+ subset_dict = merge_punctuation(subset_dict)
382
+ elif punct_mode == 'filter':
383
+ subset_dict = filter_punctuation(subset_dict)
384
+
385
+ char_width = 16 if uniform_char_width else None
386
+
387
+ if author_scale:
388
+ subset_dict = scale_per_writer(subset_dict, height, char_width)
389
+ else:
390
+ subset_dict = scale_images(subset_dict, height, char_width)
391
+
392
+ for author_id in subset_dict:
393
+ author_images = []
394
+ for image_dict in subset_dict[author_id]:
395
+ author_images.append({
396
+ 'img': PIL.Image.fromarray(image_dict.image),
397
+ 'label': image_dict.label,
398
+ 'image_id': image_dict.image_id,
399
+ 'original_image_id': image_dict.iam_image_id
400
+ })
401
+ result[key][author_id] = author_images
402
+
403
+ with open(location, 'wb') as f:
404
+ pickle.dump(result, f)
405
+
406
+
407
+ def write_fid(dataset_dict: dict, location: str):
408
+ data = dataset_dict['test']
409
+ data = scale_images(data, 64, None)
410
+ for author in data.keys():
411
+ author_folder = os.path.join(location, author)
412
+ os.mkdir(author_folder)
413
+ count = 0
414
+ for image in data[author]:
415
+ img = image.image
416
+ cv2.imwrite(os.path.join(author_folder, f"{count}.png"), img.squeeze().astype(np.uint8))
417
+ count += 1
418
+
419
+
420
+ def write_images_per_author(dataset_dict: dict, output_file: str):
421
+ data = dataset_dict["test"]
422
+
423
+ result = {}
424
+
425
+ for author in data.keys():
426
+ author_images = [image.iam_image_id for image in data[author]]
427
+ result[author] = author_images
428
+
429
+ with open(output_file, 'w') as f:
430
+ json.dump(result, f)
431
+
432
+
433
+ def write_words(dataset_dict: dict, output_file):
434
+ data = dataset_dict['train']
435
+
436
+ all_words = []
437
+
438
+ for author in data.keys():
439
+ all_words.extend([image.label for image in data[author]])
440
+
441
+ with open(output_file, 'w') as f:
442
+ for word in all_words:
443
+ f.write(f"{word}\n")
444
+
445
+
446
+ if __name__ == "__main__":
447
+ data_path = r"D:\Datasets\IAM"
448
+ fid_location = r"E:/projects/evaluation/shtg_interface/data/reference_imgs/h64/iam"
449
+ height = 32
450
+ data_collection = {}
451
+
452
+ output_location = r"E:\projects\evaluation\shtg_interface\data\datasets"
453
+
454
+ data = read_iam(data_path)
455
+ test_data = dict(scale_word_width(data['test']))
456
+ train_data = dict(scale_word_width(data['train']))
457
+ test_data.update(train_data)
458
+ for key, value in test_data.items():
459
+ for image_object in value:
460
+ if len(image_object.label) <= 0 or image_object.image.size == 0:
461
+ continue
462
+ data_collection[image_object.iam_image_id] = {
463
+ 'img': image_object.image,
464
+ 'lbl': image_object.label,
465
+ 'author_id': key
466
+ }
467
+
468
+ with gzip.open(os.path.join(output_location, f"iam_w16_words_data.pkl.gz"), 'wb') as f:
469
+ pickle.dump(data_collection, f)
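Editorial note (not part of the committed file): the `__main__` block above ends by dumping a dictionary keyed by IAM image id, where each entry holds `'img'`, `'lbl'` and `'author_id'` exactly as assembled in `create_data.py`. A minimal sketch for loading and inspecting that file; the path is illustrative.

```python
# Minimal sketch: inspect the gzip pickle produced by data/create_data.py.
import gzip
import pickle

with gzip.open("iam_w16_words_data.pkl.gz", "rb") as f:
    data = pickle.load(f)

image_id, sample = next(iter(data.items()))
print(image_id, sample["lbl"], sample["author_id"], sample["img"].shape)
```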
data/dataset.py ADDED
@@ -0,0 +1,324 @@
1
+ import random
2
+ from collections import defaultdict
3
+
4
+ import matplotlib.pyplot as plt
5
+ import torch
6
+ from torch.utils.data import Dataset
7
+ import torchvision.transforms as transforms
8
+ import os
9
+ import pickle
10
+ import numpy as np
11
+ from PIL import Image
12
+ from pathlib import Path
13
+
14
+
15
+ def get_dataset_path(dataset_name, height, file_suffix, datasets_path):
16
+ if file_suffix is not None:
17
+ filename = f'{dataset_name}-{height}-{file_suffix}.pickle'
18
+ else:
19
+ filename = f'{dataset_name}-{height}.pickle'
20
+
21
+ return os.path.join(datasets_path, filename)
22
+
23
+
24
+ def get_transform(grayscale=False, convert=True):
25
+ transform_list = []
26
+ if grayscale:
27
+ transform_list.append(transforms.Grayscale(1))
28
+
29
+ if convert:
30
+ transform_list += [transforms.ToTensor()]
31
+ if grayscale:
32
+ transform_list += [transforms.Normalize((0.5,), (0.5,))]
33
+ else:
34
+ transform_list += [transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
35
+
36
+ return transforms.Compose(transform_list)
37
+
38
+
39
+ class TextDataset:
40
+
41
+ def __init__(self, base_path, collator_resolution, num_examples=15, target_transform=None, min_virtual_size=0, validation=False, debug=False):
42
+ self.NUM_EXAMPLES = num_examples
43
+ self.debug = debug
44
+ self.min_virtual_size = min_virtual_size
45
+
46
+ subset = 'test' if validation else 'train'
47
+
48
+ # base_path=DATASET_PATHS
49
+ file_to_store = open(base_path, "rb")
50
+ self.IMG_DATA = pickle.load(file_to_store)[subset]
51
+ self.IMG_DATA = dict(list(self.IMG_DATA.items())) # [:NUM_WRITERS])
52
+ if 'None' in self.IMG_DATA.keys():
53
+ del self.IMG_DATA['None']
54
+
55
+ self.alphabet = ''.join(sorted(set(''.join(d['label'] for d in sum(self.IMG_DATA.values(), [])))))
56
+ self.author_id = list(self.IMG_DATA.keys())
57
+
58
+ self.transform = get_transform(grayscale=True)
59
+ self.target_transform = target_transform
60
+
61
+ self.collate_fn = TextCollator(collator_resolution)
62
+
63
+ def __len__(self):
64
+ if self.debug:
65
+ return 16
66
+ return max(len(self.author_id), self.min_virtual_size)
67
+
68
+ @property
69
+ def num_writers(self):
70
+ return len(self.author_id)
71
+
72
+ def __getitem__(self, index):
73
+ index = index % len(self.author_id)
74
+
75
+ author_id = self.author_id[index]
76
+
77
+ self.IMG_DATA_AUTHOR = self.IMG_DATA[author_id]
78
+ random_idxs = random.choices([i for i in range(len(self.IMG_DATA_AUTHOR))], k=self.NUM_EXAMPLES)
79
+
80
+ word_data = random.choice(self.IMG_DATA_AUTHOR)
81
+ real_img = self.transform(word_data['img'].convert('L'))
82
+ real_labels = word_data['label'].encode()
83
+
84
+ imgs = [np.array(self.IMG_DATA_AUTHOR[idx]['img'].convert('L')) for idx in random_idxs]
85
+ slabels = [self.IMG_DATA_AUTHOR[idx]['label'].encode() for idx in random_idxs]
86
+
87
+ max_width = 192 # [img.shape[1] for img in imgs]
88
+
89
+ imgs_pad = []
90
+ imgs_wids = []
91
+
92
+ for img in imgs:
93
+ img_height, img_width = img.shape[0], img.shape[1]
94
+ output_img = np.ones((img_height, max_width), dtype='float32') * 255.0
95
+ output_img[:, :img_width] = img[:, :max_width]
96
+
97
+ imgs_pad.append(self.transform(Image.fromarray(output_img.astype(np.uint8))))
98
+ imgs_wids.append(img_width)
99
+
100
+ imgs_pad = torch.cat(imgs_pad, 0)
101
+
102
+ item = {
103
+ 'simg': imgs_pad, # N images (15) that come from the same author [N (15), H (32), MAX_W (192)]
104
+ 'swids': imgs_wids, # widths of the N images [list(N)]
105
+ 'img': real_img, # the input image [1, H (32), W]
106
+ 'label': real_labels, # the label of the input image [byte]
107
+ 'img_path': 'img_path',
108
+ 'idx': 'indexes',
109
+ 'wcl': index, # id of the author [int],
110
+ 'slabels': slabels,
111
+ 'author_id': author_id
112
+ }
113
+ return item
114
+
115
+ def get_stats(self):
116
+ char_counts = defaultdict(lambda: 0)
117
+ total = 0
118
+
119
+ for author in self.IMG_DATA.keys():
120
+ for data in self.IMG_DATA[author]:
121
+ for char in data['label']:
122
+ char_counts[char] += 1
123
+ total += 1
124
+
125
+ char_counts = {k: 1.0 / (v / total) for k, v in char_counts.items()}
126
+
127
+ return char_counts
128
+
129
+
130
+ class TextCollator(object):
131
+ def __init__(self, resolution):
132
+ self.resolution = resolution
133
+
134
+ def __call__(self, batch):
135
+ if isinstance(batch[0], list):
136
+ batch = sum(batch, [])
137
+ img_path = [item['img_path'] for item in batch]
138
+ width = [item['img'].shape[2] for item in batch]
139
+ indexes = [item['idx'] for item in batch]
140
+ simgs = torch.stack([item['simg'] for item in batch], 0)
141
+ wcls = torch.Tensor([item['wcl'] for item in batch])
142
+ swids = torch.Tensor([item['swids'] for item in batch])
143
+ imgs = torch.ones([len(batch), batch[0]['img'].shape[0], batch[0]['img'].shape[1], max(width)],
144
+ dtype=torch.float32)
145
+ for idx, item in enumerate(batch):
146
+ try:
147
+ imgs[idx, :, :, 0:item['img'].shape[2]] = item['img']
148
+ except:
149
+ print(imgs.shape)
150
+ item = {'img': imgs, 'img_path': img_path, 'idx': indexes, 'simg': simgs, 'swids': swids, 'wcl': wcls}
151
+ if 'label' in batch[0].keys():
152
+ labels = [item['label'] for item in batch]
153
+ item['label'] = labels
154
+ if 'slabels' in batch[0].keys():
155
+ slabels = [item['slabels'] for item in batch]
156
+ item['slabels'] = np.array(slabels)
157
+ if 'z' in batch[0].keys():
158
+ z = torch.stack([item['z'] for item in batch])
159
+ item['z'] = z
160
+ return item
161
+
162
+
163
+ class CollectionTextDataset(Dataset):
164
+ def __init__(self, datasets, datasets_path, dataset_class, file_suffix=None, height=32, **kwargs):
165
+ self.datasets = {}
166
+ for dataset_name in sorted(datasets.split(',')):
167
+ dataset_file = get_dataset_path(dataset_name, height, file_suffix, datasets_path)
168
+ dataset = dataset_class(dataset_file, **kwargs)
169
+ self.datasets[dataset_name] = dataset
170
+ self.alphabet = ''.join(sorted(set(''.join(d.alphabet for d in self.datasets.values()))))
171
+
172
+ def __len__(self):
173
+ return sum(len(d) for d in self.datasets.values())
174
+
175
+ @property
176
+ def num_writers(self):
177
+ return sum(d.num_writers for d in self.datasets.values())
178
+
179
+ def __getitem__(self, index):
180
+ for dataset in self.datasets.values():
181
+ if index < len(dataset):
182
+ return dataset[index]
183
+ index -= len(dataset)
184
+ raise IndexError
185
+
186
+ def get_dataset(self, index):
187
+ for dataset_name, dataset in self.datasets.items():
188
+ if index < len(dataset):
189
+ return dataset_name
190
+ index -= len(dataset)
191
+ raise IndexError
192
+
193
+ def collate_fn(self, batch):
194
+ return self.datasets[self.get_dataset(0)].collate_fn(batch)
195
+
196
+
197
+ class FidDataset(Dataset):
198
+ def __init__(self, base_path, collator_resolution, num_examples=15, target_transform=None, mode='train', style_dataset=None):
199
+ self.NUM_EXAMPLES = num_examples
200
+
201
+ # base_path=DATASET_PATHS
202
+ with open(base_path, "rb") as f:
203
+ self.IMG_DATA = pickle.load(f)
204
+
205
+ self.IMG_DATA = self.IMG_DATA[mode]
206
+ if 'None' in self.IMG_DATA.keys():
207
+ del self.IMG_DATA['None']
208
+
209
+ self.STYLE_IMG_DATA = None
210
+ if style_dataset is not None:
211
+ with open(style_dataset, "rb") as f:
212
+ self.STYLE_IMG_DATA = pickle.load(f)
213
+
214
+ self.STYLE_IMG_DATA = self.STYLE_IMG_DATA[mode]
215
+ if 'None' in self.STYLE_IMG_DATA.keys():
216
+ del self.STYLE_IMG_DATA['None']
217
+
218
+ self.alphabet = ''.join(sorted(set(''.join(d['label'] for d in sum(self.IMG_DATA.values(), [])))))
219
+ self.author_id = sorted(self.IMG_DATA.keys())
220
+
221
+ self.transform = get_transform(grayscale=True)
222
+ self.target_transform = target_transform
223
+ self.dataset_size = sum(len(samples) for samples in self.IMG_DATA.values())
224
+ self.collate_fn = TextCollator(collator_resolution)
225
+
226
+ def __len__(self):
227
+ return self.dataset_size
228
+
229
+ @property
230
+ def num_writers(self):
231
+ return len(self.author_id)
232
+
233
+ def __getitem__(self, index):
234
+ NUM_SAMPLES = self.NUM_EXAMPLES
235
+ sample, author_id = None, None
236
+ for author_id, samples in self.IMG_DATA.items():
237
+ if index < len(samples):
238
+ sample, author_id = samples[index], author_id
239
+ break
240
+ index -= len(samples)
241
+
242
+ real_image = self.transform(sample['img'].convert('L'))
243
+ real_label = sample['label'].encode()
244
+
245
+ style_dataset = self.STYLE_IMG_DATA if self.STYLE_IMG_DATA is not None else self.IMG_DATA
246
+
247
+ author_style_images = style_dataset[author_id]
248
+ random_idxs = np.random.choice(len(author_style_images), NUM_SAMPLES, replace=True)
249
+ style_images = [np.array(author_style_images[idx]['img'].convert('L')) for idx in random_idxs]
250
+
251
+ max_width = 192
252
+
253
+ imgs_pad = []
254
+ imgs_wids = []
255
+
256
+ for img in style_images:
257
+ img = 255 - img
258
+ img_height, img_width = img.shape[0], img.shape[1]
259
+ outImg = np.zeros((img_height, max_width), dtype='float32')
260
+ outImg[:, :img_width] = img[:, :max_width]
261
+
262
+ img = 255 - outImg
263
+
264
+ imgs_pad.append(self.transform(Image.fromarray(img.astype(np.uint8))))
265
+ imgs_wids.append(img_width)
266
+
267
+ imgs_pad = torch.cat(imgs_pad, 0)
268
+
269
+ item = {
270
+ 'simg': imgs_pad, # widths of the N images [list(N)]
271
+ 'swids': imgs_wids, # N images (15) that come from the same author [N (15), H (32), MAX_W (192)]
272
+ 'img': real_image, # the input image [1, H (32), W]
273
+ 'label': real_label, # the label of the input image [byte]
274
+ 'img_path': 'img_path',
275
+ 'idx': sample['img_id'] if 'img_id' in sample.keys() else sample['image_id'],
276
+ 'wcl': int(author_id) # id of the author [int]
277
+ }
278
+ return item
279
+
280
+
281
+ class FolderDataset:
282
+ def __init__(self, folder_path, num_examples=15, word_lengths=None):
283
+ folder_path = Path(folder_path)
284
+ self.imgs = list([p for p in folder_path.iterdir() if not p.suffix == '.txt'])
285
+ self.transform = get_transform(grayscale=True)
286
+ self.num_examples = num_examples
287
+ self.word_lengths = word_lengths
288
+
289
+ def __len__(self):
290
+ return len(self.imgs)
291
+
292
+ def sample_style(self):
293
+ random_idxs = np.random.choice(len(self.imgs), self.num_examples, replace=False)
294
+ image_names = [self.imgs[idx].stem for idx in random_idxs]
295
+ imgs = [Image.open(self.imgs[idx]).convert('L') for idx in random_idxs]
296
+ if self.word_lengths is None:
297
+ imgs = [img.resize((img.size[0] * 32 // img.size[1], 32), Image.BILINEAR) for img in imgs]
298
+ else:
299
+ imgs = [img.resize((self.word_lengths[name] * 16, 32), Image.BILINEAR) for img, name in zip(imgs, image_names)]
300
+ imgs = [np.array(img) for img in imgs]
301
+
302
+ max_width = 192 # [img.shape[1] for img in imgs]
303
+
304
+ imgs_pad = []
305
+ imgs_wids = []
306
+
307
+ for img in imgs:
308
+ img = 255 - img
309
+ img_height, img_width = img.shape[0], img.shape[1]
310
+ outImg = np.zeros((img_height, max_width), dtype='float32')
311
+ outImg[:, :img_width] = img[:, :max_width]
312
+
313
+ img = 255 - outImg
314
+
315
+ imgs_pad.append(self.transform(Image.fromarray(img.astype(np.uint8))))
316
+ imgs_wids.append(img_width)
317
+
318
+ imgs_pad = torch.cat(imgs_pad, 0)
319
+
320
+ item = {
321
+ 'simg': imgs_pad, # widths of the N images [list(N)]
322
+ 'swids': imgs_wids, # N images (15) that come from the same author [N (15), H (32), MAX_W (192)]
323
+ }
324
+ return item
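Editorial note (not part of the committed file): a minimal sketch of how these datasets are expected to be consumed, mirroring the `CollectionTextDataset` constructor call used in `generate/authors.py`. It assumes `files/IAM-32.pickle` is in place; batch size and `num_workers` are illustrative.

```python
# Minimal sketch: wire CollectionTextDataset into a DataLoader with its own collate_fn.
from torch.utils.data import DataLoader
from data.dataset import CollectionTextDataset, TextDataset

dataset = CollectionTextDataset(
    "IAM", "files", TextDataset,
    num_examples=15, collator_resolution=16, validation=False,
)
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=0,
                    collate_fn=dataset.collate_fn)

batch = next(iter(loader))
print(batch["img"].shape, batch["simg"].shape, len(batch["label"]))
```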
data/iam_test.py ADDED
@@ -0,0 +1,51 @@
+ import os
+
+
+ def test_split():
+     iam_path = r"C:\Users\bramv\Documents\Werk\Research\Unimore\datasets\IAM"
+
+     original_set_names = ["trainset.txt", "validationset1.txt", "validationset2.txt", "testset.txt"]
+     original_set_ids = []
+
+     print("ORIGINAL IAM")
+     print("---------------------")
+
+     for set_name in original_set_names:
+         with open(os.path.join(iam_path, set_name), 'r') as f:
+             set_form_ids = ["-".join(l.rstrip().split("-")[:-1]) for l in f]
+
+         form_to_id = {}
+         with open(os.path.join(iam_path, "forms.txt"), 'r') as f:
+             for line in f:
+                 if line.startswith("#"):
+                     continue
+                 form, id, *_ = line.split(" ")
+                 assert form not in form_to_id.keys() or form_to_id[form] == id
+                 form_to_id[form] = int(id)
+
+         set_authors = [form_to_id[form] for form in set_form_ids]
+
+         set_authors = set(sorted(set_authors))
+         original_set_ids.append(set_authors)
+         print(f"{set_name} count: {len(set_authors)}")
+
+     htg_set_names = ["gan.iam.tr_va.gt.filter27", "gan.iam.test.gt.filter27"]
+
+     print("\n\nHTG IAM")
+     print("---------------------")
+
+     for set_name in htg_set_names:
+         with open(os.path.join(iam_path, set_name), 'r') as f:
+             set_authors = [int(l.split(",")[0]) for l in f]
+
+         set_authors = set(set_authors)
+
+         print(f"{set_name} count: {len(set_authors)}")
+         for name, original_set in zip(original_set_names, original_set_ids):
+             intr = set_authors.intersection(original_set)
+             print(f"\t intersection with {name}: {len(intr)}")
+
+
+ if __name__ == "__main__":
+     test_split()
data/show_dataset.py ADDED
@@ -0,0 +1,149 @@
1
+ import os
2
+ import pickle
3
+ import random
4
+ import shutil
5
+
6
+ import cv2
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+
10
+ from data.dataset import get_transform
11
+
12
+
13
+ def summarize_dataset(data: dict):
14
+ print(f"Training authors: {len(data['train'].keys())} \t Testing authors: {len(data['test'].keys())}")
15
+ training_images = sum([len(data['train'][k]) for k in data['train'].keys()])
16
+ testing_images = sum([len(data['test'][k]) for k in data['test'].keys()])
17
+ print(f"Training images: {training_images} \t Testing images: {testing_images}")
18
+
19
+
20
+ def compare_data(path_a: str, path_b: str):
21
+ with open(path_a, 'rb') as f:
22
+ data_a = pickle.load(f)
23
+ summarize_dataset(data_a)
24
+
25
+ with open(path_b, 'rb') as f:
26
+ data_b = pickle.load(f)
27
+ summarize_dataset(data_b)
28
+
29
+ training_a = data_a['train']
30
+ training_b = data_b['train']
31
+
32
+ training_a = {int(k): v for k, v in training_a.items()}
33
+ training_b = {int(k): v for k, v in training_b.items()}
34
+
35
+ while True:
36
+ author = random.choice(list(training_a.keys()))
37
+
38
+ if author in training_b.keys():
39
+ author_images_a = [np.array(im_dict["img"]) for im_dict in training_a[author]]
40
+ author_images_b = [np.array(im_dict["img"]) for im_dict in training_b[author]]
41
+
42
+ labels_a = [str(im_dict["label"]) for im_dict in training_a[author]]
43
+ labels_b = [str(im_dict["label"]) for im_dict in training_b[author]]
44
+
45
+ vis_a = np.hstack(author_images_a[:10])
46
+ vis_b = np.hstack(author_images_b[:10])
47
+
48
+ cv2.imshow("Author a", vis_a)
49
+ cv2.imshow("Author b", vis_b)
50
+
51
+ cv2.waitKey(0)
52
+
53
+ else:
54
+ print(f"Author: {author} not found in second dataset")
55
+
56
+
57
+ def show_dataset(path: str, samples: int = 10):
58
+ with open(path, 'rb') as f:
59
+ data = pickle.load(f)
60
+ summarize_dataset(data)
61
+
62
+ training = data['train']
63
+
64
+ author = training['013']
65
+ author_images = [np.array(im_dict["img"]).astype(np.uint8) for im_dict in author]
66
+
67
+ for img in author_images:
68
+ cv2.imshow('image', img)
69
+ cv2.waitKey(0)
70
+
71
+ for author in list(training.keys()):
72
+
73
+ author_images = [np.array(im_dict["img"]).astype(np.uint8) for im_dict in training[author]]
74
+ labels = [str(im_dict["label"]) for im_dict in training[author]]
75
+
76
+ vis = np.hstack(author_images[:samples])
77
+ print(f"Author: {author}")
78
+ cv2.destroyAllWindows()
79
+ cv2.imshow("vis", vis)
80
+ cv2.waitKey(0)
81
+
82
+
83
+ def test_transform(path: str):
84
+ with open(path, 'rb') as f:
85
+ data = pickle.load(f)
86
+ summarize_dataset(data)
87
+
88
+ training = data['train']
89
+ transform = get_transform(grayscale=True)
90
+
91
+ for author_id in training.keys():
92
+ author = training[author_id]
93
+ for image_dict in author:
94
+ original_image = image_dict['img'].convert('L')
95
+ transformed_image = transform(original_image).detach().numpy()
96
+ restored_image = (((transformed_image + 1) / 2) * 255).astype(np.uint8)
97
+ restored_image = np.squeeze(restored_image)
98
+ original_image = np.array(original_image)
99
+
100
+ wrong_pixels = (original_image != restored_image).astype(np.uint8) * 255
101
+
102
+ combined = np.hstack((restored_image, original_image, wrong_pixels))
103
+
104
+ cv2.imshow("original", original_image)
105
+ cv2.imshow("restored", restored_image)
106
+ cv2.imshow("combined", combined)
107
+
108
+ f, ax = plt.subplots(1, 2)
109
+ ax[0].hist(original_image.flatten())
110
+ ax[1].hist(restored_image.flatten())
111
+ plt.show()
112
+
113
+ cv2.waitKey(0)
114
+
115
+ def dump_words():
116
+ data_path = r"..\files\IAM-32.pickle"
117
+
118
+ p_mark = 'point'
119
+ p = '.'
120
+
121
+ with open(data_path, 'rb') as f:
122
+ data = pickle.load(f)
123
+
124
+ training = data['train']
125
+
126
+ target_folder = f"../saved_images/debug/{p_mark}"
127
+
128
+ if os.path.exists(target_folder):
129
+ shutil.rmtree(target_folder)
130
+
131
+ os.mkdir(target_folder)
132
+
133
+ count = 0
134
+
135
+ for author in list(training.keys()):
136
+
137
+ author_images = [np.array(im_dict["img"]).astype(np.uint8) for im_dict in training[author]]
138
+ labels = [str(im_dict["label"]) for im_dict in training[author]]
139
+
140
+ for img, label in zip(author_images, labels):
141
+ if p in label:
142
+ cv2.imwrite(os.path.join(target_folder, f"{count}.png"), img)
143
+ count += 1
144
+
145
+
146
+ if __name__ == "__main__":
147
+ test_transform("../files/IAM-32.pickle")
148
+ #show_dataset("../files/IAM-32.pickle")
149
+ #compare_data(r"../files/IAM-32.pickle", r"../files/_IAM-32.pickle")
files/IAM-32-pa.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:92bff8330e8f404b5f382846266257b5cac45d6c27908df5c3ee7d0c77a0ee95
+ size 245981914
files/IAM-32.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c56d4055470c26a30dbbdf7f2e232eb86ffc714b803651dbac5576ee2bc97937
+ size 590113103
files/cvl_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b47fe3ffe291bb3e52db0643125a99206840884181ed21312bcbe2cdd86303f0
+ size 163050271
files/english_words.txt ADDED
The diff for this file is too large to render. See raw diff
 
files/files ADDED
@@ -0,0 +1 @@
+ files
files/hwt.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:999f85148e34e30242c1aa9ed7063c9dbc9da008f868ed26cb6ed923f9d8c0bd
+ size 163050271
files/resnet_18_pretrained.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf5f5f6a94152dc4b0e9f2e390d658ef621efead3824cd494d3a82a6c8ceb5e0
+ size 48833885
files/unifont.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0804979068f0d169b343fbe0fe8d7ff478165d07a671fcf52e20f625db8e7f9f
+ size 16978300
files/vatr.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65b67f1738bf74d5bf612f7f35e2c8c9560568d7efe422beb9132e1bb68bbef8
+ size 565758212
files/vatrpp.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c02f950d19cf3df3cfa6fe97114557e16a51bd3b910da6b5a2359a29851b84b6
+ size 561198056
generate.py ADDED
@@ -0,0 +1,49 @@
1
+ import argparse
2
+ from generate import generate_text, generate_authors, generate_fid, generate_page, generate_ocr, generate_ocr_msgpack
3
+ from generate.ocr import generate_ocr_reference
4
+ from util.misc import add_vatr_args
5
+
6
+ if __name__ == '__main__':
7
+ parser = argparse.ArgumentParser()
8
+ parser.add_argument("action", choices=['text', 'fid', 'page', 'authors', 'ocr'])
9
+
10
+ parser.add_argument("-s", "--style-folder", default='files/style_samples/00', type=str)
11
+ parser.add_argument("-t", "--text", default='That\'s one small step for man, one giant leap for mankind ΑαΒβΓγΔδ', type=str)
12
+ parser.add_argument("--text-path", default=None, type=str, help='Path to text file with texts to generate')
13
+ parser.add_argument("-c", "--checkpoint", default='files/vatr.pth', type=str)
14
+ parser.add_argument("-o", "--output", default=None, type=str)
15
+ parser.add_argument("--count", default=1000, type=int)
16
+ parser.add_argument("-a", "--align", action='store_true')
17
+ parser.add_argument("--at-once", action='store_true')
18
+ parser.add_argument("--output-style", action='store_true')
19
+ parser.add_argument("-d", "--dataset-path", type=str)
20
+ parser.add_argument("--target-dataset-path", type=str, default=None)
21
+ parser.add_argument("--charset-file", type=str, default=None)
22
+ parser.add_argument("--interp-styles", action='store_true')
23
+
24
+ parser.add_argument("--test-only", action='store_true')
25
+ parser.add_argument("--fake-only", action='store_true')
26
+ parser.add_argument("--all-epochs", action='store_true')
27
+ parser.add_argument("--long-tail", action='store_true')
28
+ parser.add_argument("--msgpack", action='store_true')
29
+ parser.add_argument("--reference", action='store_true')
30
+ parser.add_argument("--test-set", action='store_true')
31
+
32
+ parser = add_vatr_args(parser)
33
+ args = parser.parse_args()
34
+
35
+ if args.action == 'text':
36
+ generate_text(args)
37
+ elif args.action == 'authors':
38
+ generate_authors(args)
39
+ elif args.action == 'fid':
40
+ generate_fid(args)
41
+ elif args.action == 'page':
42
+ generate_page(args)
43
+ elif args.action == 'ocr':
44
+ if args.msgpack:
45
+ generate_ocr_msgpack(args)
46
+ elif args.reference:
47
+ generate_ocr_reference(args)
48
+ else:
49
+ generate_ocr(args)
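A minimal sketch of driving `generate.py` from another Python script via `subprocess`; the flag values are examples taken from the argparse defaults above.

```python
# Invoke the 'text' action of generate.py as a subprocess (example paths/values).
import subprocess

subprocess.run(
    [
        "python", "generate.py", "text",
        "--checkpoint", "files/vatrpp.pth",
        "--style-folder", "files/style_samples/00",
        "--text", "one small step for man",
        "--output", "files/output.png",
    ],
    check=True,  # raise CalledProcessError if generation fails
)
```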
generate/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from generate.text import generate_text
2
+ from generate.fid import generate_fid
3
+ from generate.authors import generate_authors
4
+ from generate.page import generate_page
5
+ from generate.ocr import generate_ocr, generate_ocr_msgpack
generate/authors.py ADDED
@@ -0,0 +1,48 @@
1
+ import os
2
+ import shutil
3
+
4
+ import cv2
5
+ import numpy as np
6
+
7
+ from data.dataset import CollectionTextDataset, TextDataset
8
+ from generate.util import stack_lines
9
+ from generate.writer import Writer
10
+
11
+
12
+ def generate_authors(args):
13
+ dataset = CollectionTextDataset(
14
+ args.dataset, 'files', TextDataset, file_suffix=args.file_suffix, num_examples=args.num_examples,
15
+ collator_resolution=args.resolution, validation=args.test_set
16
+ )
17
+
18
+ args.num_writers = dataset.num_writers
19
+
20
+ writer = Writer(args.checkpoint, args, only_generator=True)
21
+
22
+ if args.text.endswith(".txt"):
23
+ with open(args.text, 'r') as f:
24
+ lines = [l.rstrip() for l in f]
25
+ else:
26
+ lines = [args.text]
27
+
28
+ output_dir = "saved_images/author_samples/"
29
+ if os.path.exists(output_dir):
30
+ shutil.rmtree(output_dir)
31
+ os.mkdir(output_dir)
32
+
33
+ fakes, author_ids, style_images = writer.generate_authors(lines, dataset, args.align, args.at_once)
34
+
35
+ for fake, author_id, style in zip(fakes, author_ids, style_images):
36
+ author_dir = os.path.join(output_dir, str(author_id))
37
+ os.mkdir(author_dir)
38
+
39
+ for i, line in enumerate(fake):
40
+ cv2.imwrite(os.path.join(author_dir, f"line_{i}.png"), line)
41
+
42
+ total = stack_lines(fake)
43
+ cv2.imwrite(os.path.join(author_dir, "total.png"), total)
44
+
45
+ if args.output_style:
46
+ for i, image in enumerate(style):
47
+ cv2.imwrite(os.path.join(author_dir, f"style_{i}.png"), image)
48
+
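`generate_authors` writes one folder per author containing `line_{i}.png`, `total.png` and, optionally, `style_{i}.png`. A small sketch for collecting the `total.png` previews after a run (folder layout as created above):

```python
# Gather the per-author preview images written by generate_authors.
import os
import cv2

output_dir = "saved_images/author_samples/"
previews = {}
for author_id in sorted(os.listdir(output_dir)):
    total_path = os.path.join(output_dir, author_id, "total.png")
    if os.path.isfile(total_path):
        previews[author_id] = cv2.imread(total_path, cv2.IMREAD_GRAYSCALE)

print(f"Loaded previews for {len(previews)} authors")
```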
generate/fid.py ADDED
@@ -0,0 +1,63 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import torch
5
+ import torch.utils.data
6
+
7
+ from data.dataset import FidDataset
8
+ from generate.writer import Writer
9
+
10
+
11
+ def generate_fid(args):
12
+ if 'iam' in args.target_dataset_path.lower():
13
+ args.num_writers = 339
14
+ elif 'cvl' in args.target_dataset_path.lower():
15
+ args.num_writers = 283
16
+ else:
17
+ raise ValueError(f"Cannot determine the number of writers from '{args.target_dataset_path}'")
18
+
19
+ args.vocab_size = len(args.alphabet)
20
+
21
+ dataset_train = FidDataset(base_path=args.target_dataset_path, num_examples=args.num_examples, collator_resolution=args.resolution, mode='train', style_dataset=args.dataset_path)
22
+ train_loader = torch.utils.data.DataLoader(
23
+ dataset_train,
24
+ batch_size=args.batch_size,
25
+ shuffle=False,
26
+ num_workers=args.num_workers,
27
+ pin_memory=True, drop_last=False,
28
+ collate_fn=dataset_train.collate_fn
29
+ )
30
+
31
+ dataset_test = FidDataset(base_path=args.target_dataset_path, num_examples=args.num_examples, collator_resolution=args.resolution, mode='test', style_dataset=args.dataset_path)
32
+ test_loader = torch.utils.data.DataLoader(
33
+ dataset_test,
34
+ batch_size=args.batch_size,
35
+ shuffle=False,
36
+ num_workers=0,
37
+ pin_memory=True, drop_last=False,
38
+ collate_fn=dataset_test.collate_fn
39
+ )
40
+
41
+ args.output = 'saved_images' if args.output is None else args.output
42
+ args.output = Path(args.output) / 'fid' / args.target_dataset_path.split("/")[-1].replace(".pickle", "").replace("-", "")
43
+
44
+ model_folder = args.checkpoint.split("/")[-2] if args.checkpoint.endswith(".pth") else args.checkpoint.split("/")[-1]
45
+ model_tag = model_folder.split("-")[-1] if "-" in model_folder else "vatr"
46
+ model_tag += "_" + args.dataset_path.split("/")[-1].replace(".pickle", "").replace("-", "")
47
+
48
+ if not args.all_epochs:
49
+ writer = Writer(args.checkpoint, args, only_generator=True)
50
+ if not args.test_only:
51
+ writer.generate_fid(args.output, train_loader, model_tag=model_tag, split='train', fake_only=args.fake_only, long_tail_only=args.long_tail)
52
+ writer.generate_fid(args.output, test_loader, model_tag=model_tag, split='test', fake_only=args.fake_only, long_tail_only=args.long_tail)
53
+ else:
54
+ epochs = sorted([int(f.split("_")[0]) for f in os.listdir(args.checkpoint) if "_" in f])
55
+ generate_real = True
56
+
57
+ for epoch in epochs:
58
+ checkpoint_path = os.path.join(args.checkpoint, f"{str(epoch).zfill(4)}_model.pth")
59
+ writer = Writer(checkpoint_path, args, only_generator=True)
60
+ writer.generate_fid(args.output, test_loader, model_tag=f"{model_tag}_{epoch}", split='test', fake_only=not generate_real, long_tail_only=args.long_tail)
61
+ generate_real = False
62
+
63
+ print('Done')
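The output folder and model tag in `generate_fid` are derived purely from the checkpoint and dataset paths. A sketch with example values (the paths below are placeholders, not files shipped in this commit):

```python
# Mirror the string manipulation used above to build the FID output layout.
from pathlib import Path

target_dataset_path = "files/IAM-32.pickle"          # example --target-dataset-path
checkpoint = "saved_models/vatr-pa/0100_model.pth"   # example --checkpoint
dataset_path = "files/IAM-32-pa.pickle"              # example --dataset-path

output = Path("saved_images") / "fid" / target_dataset_path.split("/")[-1].replace(".pickle", "").replace("-", "")

model_folder = checkpoint.split("/")[-2] if checkpoint.endswith(".pth") else checkpoint.split("/")[-1]
model_tag = model_folder.split("-")[-1] if "-" in model_folder else "vatr"
model_tag += "_" + dataset_path.split("/")[-1].replace(".pickle", "").replace("-", "")

print(output)     # saved_images/fid/IAM32
print(model_tag)  # pa_IAM32pa
```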
generate/ocr.py ADDED
@@ -0,0 +1,72 @@
1
+ import os
2
+ import shutil
3
+
4
+ import cv2
5
+ import msgpack
6
+ import torch
7
+
8
+ from data.dataset import CollectionTextDataset, TextDataset, FolderDataset, FidDataset, get_dataset_path
9
+ from generate.writer import Writer
10
+ from util.text import get_generator
11
+
12
+
13
+ def generate_ocr(args):
14
+ """
15
+ Generate OCR training data. The words to render come from the given text generator.
16
+ """
17
+ dataset = CollectionTextDataset(
18
+ args.dataset, 'files', TextDataset, file_suffix=args.file_suffix, num_examples=args.num_examples,
19
+ collator_resolution=args.resolution, validation=True
20
+ )
21
+ args.num_writers = dataset.num_writers
22
+
23
+ writer = Writer(args.checkpoint, args, only_generator=True)
24
+
25
+ generator = get_generator(args)
26
+
27
+ writer.generate_ocr(dataset, args.count, interpolate_style=args.interp_styles, output_folder=args.output, text_generator=generator)
28
+
29
+
30
+ def generate_ocr_reference(args):
31
+ """
32
+ Generate OCR training data. The words to render are taken from the given dataset, and the corresponding reference images are also saved.
33
+ """
34
+ dataset = CollectionTextDataset(
35
+ args.dataset, 'files', TextDataset, file_suffix=args.file_suffix, num_examples=args.num_examples,
36
+ collator_resolution=args.resolution, validation=True
37
+ )
38
+
39
+ #dataset = FidDataset(get_dataset_path(args.dataset, 32, args.file_suffix, 'files'), mode='test', collator_resolution=args.resolution)
40
+
41
+ args.num_writers = dataset.num_writers
42
+
43
+ writer = Writer(args.checkpoint, args, only_generator=True)
44
+
45
+ writer.generate_ocr(dataset, args.count, interpolate_style=args.interp_styles, output_folder=args.output, long_tail=args.long_tail)
46
+
47
+
48
+ def generate_ocr_msgpack(args):
49
+ """
50
+ Generate an OCR dataset. The words to render are listed in the given msgpack file.
51
+ """
52
+ dataset = FolderDataset(args.dataset_path)
53
+ args.num_writers = 339
54
+
55
+ if args.charset_file:
56
+ charset = msgpack.load(open(args.charset_file, 'rb'), use_list=False, strict_map_key=False)
57
+ args.alphabet = "".join(charset['char2idx'].keys())
58
+
59
+ writer = Writer(args.checkpoint, args, only_generator=True)
60
+
61
+ lines = msgpack.load(open(args.text_path, 'rb'), use_list=False)
62
+
63
+ print(f"Generating {len(lines)} images to {args.output}")
64
+
65
+ for i, (filename, target) in enumerate(lines):
66
+ if not os.path.exists(os.path.join(args.output, filename)):
67
+ style = torch.unsqueeze(dataset.sample_style()['simg'], dim=0).to(args.device)
68
+ fake = writer.create_fake_sentence(style, target, at_once=True)
69
+
70
+ cv2.imwrite(os.path.join(args.output, filename), fake)
71
+
72
+ print(f"Done")
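Both OCR generation paths write a `labels.csv` with a `filename,words` header next to the images. A sketch for reading it back (the output folder is an example; note the writer does not quote fields, so labels containing commas would need special handling):

```python
# Read the annotations produced by Writer.generate_ocr.
import csv
import os

output_folder = "saved_images/ocr"  # example --output value
with open(os.path.join(output_folder, "generated", "labels.csv"), newline='') as f:
    samples = [(row["filename"], row["words"]) for row in csv.DictReader(f)]

print(f"{len(samples)} generated samples")
```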
generate/page.py ADDED
@@ -0,0 +1,57 @@
1
+ import os
2
+
3
+ import cv2
4
+ import numpy as np
5
+ import torch
6
+
7
+ from data.dataset import CollectionTextDataset, TextDataset
8
+ from models.model import VATr
9
+ from util.loading import load_checkpoint, load_generator
10
+
11
+
12
+ def generate_page(args):
13
+ args.output = 'vatr' if args.output is None else args.output
14
+
15
+ args.vocab_size = len(args.alphabet)
16
+
17
+ dataset = CollectionTextDataset(
18
+ args.dataset, 'files', TextDataset, file_suffix=args.file_suffix, num_examples=args.num_examples,
19
+ collator_resolution=args.resolution
20
+ )
21
+ datasetval = CollectionTextDataset(
22
+ args.dataset, 'files', TextDataset, file_suffix=args.file_suffix, num_examples=args.num_examples,
23
+ collator_resolution=args.resolution, validation=True
24
+ )
25
+
26
+ args.num_writers = dataset.num_writers
27
+
28
+ model = VATr(args)
29
+ checkpoint = torch.load(args.checkpoint, map_location=args.device)
30
+ model = load_generator(model, checkpoint)
31
+
32
+ train_loader = torch.utils.data.DataLoader(
33
+ dataset,
34
+ batch_size=8,
35
+ shuffle=True,
36
+ num_workers=0,
37
+ pin_memory=True, drop_last=True,
38
+ collate_fn=dataset.collate_fn)
39
+
40
+ val_loader = torch.utils.data.DataLoader(
41
+ datasetval,
42
+ batch_size=8,
43
+ shuffle=True,
44
+ num_workers=0,
45
+ pin_memory=True, drop_last=True,
46
+ collate_fn=datasetval.collate_fn)
47
+
48
+ data_train = next(iter(train_loader))
49
+ data_val = next(iter(val_loader))
50
+
51
+ model.eval()
52
+ with torch.no_grad():
53
+ page = model._generate_page(data_train['simg'].to(args.device), data_val['swids'])
54
+ page_val = model._generate_page(data_val['simg'].to(args.device), data_val['swids'])
55
+
56
+ cv2.imwrite(os.path.join("saved_images", "pages", f"{args.output}_train.png"), (page * 255).astype(np.uint8))
57
+ cv2.imwrite(os.path.join("saved_images", "pages", f"{args.output}_val.png"), (page_val * 255).astype(np.uint8))
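`_generate_page` is assumed here to return a float image in [0, 1], which is why the result is scaled to 8-bit before `cv2.imwrite`. A minimal sketch of that save step with a stand-in array:

```python
# Scale a float page image in [0, 1] to uint8 and write it with OpenCV.
import os
import cv2
import numpy as np

page = np.random.rand(256, 512)  # stand-in for model._generate_page(...)
os.makedirs(os.path.join("saved_images", "pages"), exist_ok=True)
cv2.imwrite(os.path.join("saved_images", "pages", "example_train.png"), (page * 255).astype(np.uint8))
```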
generate/text.py ADDED
@@ -0,0 +1,24 @@
1
+ from pathlib import Path
2
+
3
+ import cv2
4
+
5
+ from generate.writer import Writer
6
+
7
+
8
+ def generate_text(args):
9
+ if args.text_path is not None:
10
+ with open(args.text_path, 'r') as f:
11
+ args.text = f.read()
12
+ args.text = args.text.splitlines()
13
+ args.output = 'files/output.png' if args.output is None else args.output
14
+ args.output = Path(args.output)
15
+ args.output.parent.mkdir(parents=True, exist_ok=True)
16
+ args.num_writers = 0
17
+
18
+ writer = Writer(args.checkpoint, args, only_generator=True)
19
+ writer.set_style_folder(args.style_folder)
20
+ fakes = writer.generate(args.text, args.align)
21
+ for i, fake in enumerate(fakes):
22
+ dst_path = args.output.parent / (args.output.stem + f'_{i:03d}' + args.output.suffix)
23
+ cv2.imwrite(str(dst_path), fake)
24
+ print('Done')
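`generate_text` writes one image per generated line, numbering them with a zero-padded suffix next to the requested output path. A sketch of the naming scheme:

```python
# Reproduce the output naming used by generate_text.
from pathlib import Path

output = Path("files/output.png")  # default --output value
for i in range(3):                 # pretend three lines were generated
    dst_path = output.parent / (output.stem + f"_{i:03d}" + output.suffix)
    print(dst_path)                # files/output_000.png, files/output_001.png, files/output_002.png
```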
generate/util.py ADDED
@@ -0,0 +1,15 @@
1
+ import numpy as np
2
+
3
+
4
+ def stack_lines(lines: list, h_gap: int = 6):
5
+ width = max([im.shape[1] for im in lines])
6
+ height = (lines[0].shape[0] + h_gap) * len(lines)
7
+
8
+ result = np.ones((height, width)) * 255
9
+
10
+ y_pos = 0
11
+ for line in lines:
12
+ result[y_pos:y_pos + line.shape[0], 0:line.shape[1]] = line
13
+ y_pos += line.shape[0] + h_gap
14
+
15
+ return result
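A usage sketch for `stack_lines`; it assumes all line images share the height of the first one, as the generated 32-pixel-high lines do.

```python
# Stack three dummy line images of different widths onto one white canvas.
import numpy as np
from generate.util import stack_lines  # the helper defined above

lines = [np.zeros((32, w)) for w in (120, 80, 150)]  # black dummy "lines"
page = stack_lines(lines, h_gap=6)

print(page.shape)  # (3 * (32 + 6), 150) == (114, 150)
```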
generate/writer.py ADDED
@@ -0,0 +1,329 @@
1
+ import json
2
+ import os
3
+ import random
4
+ import shutil
5
+ from collections import defaultdict
6
+ import time
7
+ from datetime import timedelta
8
+ from pathlib import Path
9
+
10
+ import cv2
11
+ import numpy as np
12
+ import torch
13
+
14
+ from data.dataset import FolderDataset
15
+ from models.model import VATr
16
+ from util.loading import load_checkpoint, load_generator
17
+ from util.misc import FakeArgs
18
+ from util.text import TextGenerator
19
+ from util.vision import detect_text_bounds
20
+
21
+
22
+ def get_long_tail_chars():
23
+ with open(f"files/longtail.txt", 'r') as f:
24
+ chars = [c.rstrip() for c in f]
25
+
26
+ chars.remove('')
27
+
28
+ return chars
29
+
30
+
31
+ class Writer:
32
+ def __init__(self, checkpoint_path, args, only_generator: bool = False):
33
+ self.model = VATr(args)
34
+ checkpoint = torch.load(checkpoint_path, map_location=args.device)
35
+ load_checkpoint(self.model, checkpoint) if not only_generator else load_generator(self.model, checkpoint)
36
+ self.model.eval()
37
+ self.style_dataset = None
38
+
39
+ def set_style_folder(self, style_folder, num_examples=15):
40
+ word_lengths = None
41
+ if os.path.exists(os.path.join(style_folder, "word_lengths.txt")):
42
+ word_lengths = {}
43
+ with open(os.path.join(style_folder, "word_lengths.txt"), 'r') as f:
44
+ for line in f:
45
+ word, length = line.rstrip().split(",")
46
+ word_lengths[word] = int(length)
47
+
48
+ self.style_dataset = FolderDataset(style_folder, num_examples=num_examples, word_lengths=word_lengths)
49
+
50
+ @torch.no_grad()
51
+ def generate(self, texts, align_words: bool = False, at_once: bool = False):
52
+ if isinstance(texts, str):
53
+ texts = [texts]
54
+ if self.style_dataset is None:
55
+ raise Exception('Style is not set')
56
+
57
+ fakes = []
58
+ for i, text in enumerate(texts, 1):
59
+ print(f'[{i}/{len(texts)}] Generating for text: {text}')
60
+ style = self.style_dataset.sample_style()
61
+ style_images = style['simg'].unsqueeze(0).to(self.model.args.device)
62
+
63
+ fake = self.create_fake_sentence(style_images, text, align_words, at_once)
64
+
65
+ fakes.append(fake)
66
+ return fakes
67
+
68
+ @torch.no_grad()
69
+ def create_fake_sentence(self, style_images, text, align_words=False, at_once=False):
70
+ text = "".join([c for c in text if c in self.model.args.alphabet])
71
+
72
+ text = text.split() if not at_once else [text]
73
+ gap = np.ones((32, 16))
74
+
75
+ text_encode, len_text, encode_pos = self.model.netconverter.encode(text)
76
+ text_encode = text_encode.to(self.model.args.device).unsqueeze(0)
77
+
78
+ fake = self.model._generate_fakes(style_images, text_encode, len_text)
79
+ if not at_once:
80
+ if align_words:
81
+ fake = self.stitch_words(fake, show_lines=False)
82
+ else:
83
+ fake = np.concatenate(sum([[img, gap] for img in fake], []), axis=1)[:, :-16]
84
+ else:
85
+ fake = fake[0]
86
+ fake = (fake * 255).astype(np.uint8)
87
+
88
+ return fake
89
+
90
+ @torch.no_grad()
91
+ def generate_authors(self, text, dataset, align_words: bool = False, at_once: bool = False):
92
+ fakes = []
93
+ author_ids = []
94
+ style = []
95
+
96
+ for item in dataset:
97
+ print(f"Generating author {item['wcl']}")
98
+ style_images = item['simg'].to(self.model.args.device).unsqueeze(0)
99
+
100
+ generated_lines = [self.create_fake_sentence(style_images, line, align_words, at_once) for line in text]
101
+
102
+ fakes.append(generated_lines)
103
+ author_ids.append(item['author_id'])
104
+ style.append((((item['simg'].numpy() + 1.0) / 2.0) * 255).astype(np.uint8))
105
+
106
+ return fakes, author_ids, style
107
+
108
+ @torch.no_grad()
109
+ def generate_characters(self, dataset, characters: str):
110
+ """
111
+ Generate each of the given characters for each of the authors in the dataset.
112
+ """
113
+ fakes = []
114
+
115
+ text_encode, len_text, encode_pos = self.model.netconverter.encode([c for c in characters])
116
+ text_encode = text_encode.to(self.model.args.device).unsqueeze(0)
117
+
118
+ for item in dataset:
119
+ print(f"Generating author {item['wcl']}")
120
+ style_images = item['simg'].to(self.model.args.device).unsqueeze(0)
121
+ fake = self.model.netG.evaluate(style_images, text_encode)
122
+
123
+ fakes.append(fake)
124
+
125
+ return fakes
126
+
127
+ @torch.no_grad()
128
+ def generate_batch(self, style_imgs, text):
129
+ """
130
+ Given a batch of style images and text, generate images using the model
131
+ """
132
+ device = self.model.args.device
133
+ text_encode, _, _ = self.model.netconverter.encode(text)
134
+ fakes, _ = self.model.netG(style_imgs.to(device), text_encode.to(device))
135
+ return fakes
136
+
137
+ @torch.no_grad()
138
+ def generate_ocr(self, dataset, number: int, output_folder: str = 'saved_images/ocr', interpolate_style: bool = False, text_generator: TextGenerator = None, long_tail: bool = False):
139
+ def create_and_write(style, text, interpolated=False):
140
+ nonlocal image_counter, annotations
141
+
142
+ text_encode, len_text, encode_pos = self.model.netconverter.encode([text])
143
+ text_encode = text_encode.to(self.model.args.device)
144
+
145
+ fake = self.model.netG.generate(style, text_encode)
146
+
147
+ fake = (fake + 1) / 2
148
+ fake = fake.cpu().numpy()
149
+ fake = np.squeeze((fake * 255).astype(np.uint8))
150
+
151
+ image_filename = f"{image_counter}.png" if not interpolated else f"{image_counter}_i.png"
152
+
153
+ cv2.imwrite(os.path.join(output_folder, "generated", image_filename), fake)
154
+
155
+ annotations.append((image_filename, text))
156
+
157
+ image_counter += 1
158
+
159
+ image_counter = 0
160
+ annotations = []
161
+ previous_style = None
162
+ long_tail_chars = get_long_tail_chars()
163
+
164
+ os.mkdir(os.path.join(output_folder, "generated"))
165
+ if text_generator is None:
166
+ os.mkdir(os.path.join(output_folder, "reference"))
167
+
168
+ while image_counter < number:
169
+ author_index = random.randint(0, len(dataset) - 1)
170
+ item = dataset[author_index]
171
+
172
+ style_images = item['simg'].to(self.model.args.device).unsqueeze(0)
173
+ style = self.model.netG.compute_style(style_images)
174
+
175
+ if interpolate_style and previous_style is not None:
176
+ factor = float(np.clip(random.gauss(0.5, 0.15), 0.0, 1.0))
177
+ intermediate_style = torch.lerp(previous_style, style, factor)
178
+ text = text_generator.generate()
179
+
180
+ create_and_write(intermediate_style, text, interpolated=True)
181
+
182
+ if text_generator is not None:
183
+ text = text_generator.generate()
184
+ else:
185
+ text = str(item['label'].decode())
186
+
187
+ if long_tail and not any(c in long_tail_chars for c in text):
188
+ continue
189
+
190
+ fake = (item['img'] + 1) / 2
191
+ fake = fake.cpu().numpy()
192
+ fake = np.squeeze((fake * 255).astype(np.uint8))
193
+
194
+ image_filename = f"{image_counter}.png"
195
+
196
+ cv2.imwrite(os.path.join(output_folder, "reference", image_filename), fake)
197
+
198
+ create_and_write(style, text)
199
+
200
+ previous_style = style
201
+
202
+ if text_generator is None:
203
+ with open(os.path.join(output_folder, "reference", "labels.csv"), 'w') as fr:
204
+ fr.write(f"filename,words\n")
205
+ for annotation in annotations:
206
+ fr.write(f"{annotation[0]},{annotation[1]}\n")
207
+
208
+ with open(os.path.join(output_folder, "generated", "labels.csv"), 'w') as fg:
209
+ fg.write(f"filename,words\n")
210
+ for annotation in annotations:
211
+ fg.write(f"{annotation[0]},{annotation[1]}\n")
212
+
213
+
214
+ @staticmethod
215
+ def stitch_words(words: list, show_lines: bool = False, scale_words: bool = False):
216
+ gap_width = 16
217
+
218
+ bottom_lines = []
219
+ top_lines = []
220
+ for i in range(len(words)):
221
+ b, t = detect_text_bounds(words[i])
222
+ bottom_lines.append(b)
223
+ top_lines.append(t)
224
+ if show_lines:
225
+ words[i] = cv2.line(words[i], (0, b), (words[i].shape[1], b), (0, 0, 1.0))
226
+ words[i] = cv2.line(words[i], (0, t), (words[i].shape[1], t), (1.0, 0, 0))
227
+
228
+ bottom_lines = np.array(bottom_lines, dtype=float)
229
+
230
+ if scale_words:
231
+ top_lines = np.array(top_lines, dtype=float)
232
+ gaps = bottom_lines - top_lines
233
+ target_gap = np.mean(gaps)
234
+ scales = target_gap / gaps
235
+
236
+ bottom_lines *= scales
237
+ top_lines *= scales
238
+ words = [cv2.resize(word, None, fx=scale, fy=scale) for word, scale in zip(words, scales)]
239
+
240
+ highest = np.max(bottom_lines)
241
+ offsets = highest - bottom_lines
242
+ height = np.max(offsets + [word.shape[0] for word in words])
243
+
244
+ result = np.ones((int(height), gap_width * len(words) + sum([w.shape[1] for w in words])))
245
+
246
+ x_pos = 0
247
+ for bottom_line, word in zip(bottom_lines, words):
248
+ offset = int(highest - bottom_line)
249
+
250
+ result[offset:offset + word.shape[0], x_pos:x_pos+word.shape[1]] = word
251
+
252
+ x_pos += word.shape[1] + gap_width
253
+
254
+ return result
255
+
256
+ @torch.no_grad()
257
+ def generate_fid(self, path, loader, model_tag, split='train', fake_only=False, long_tail_only=False):
258
+ if not isinstance(path, Path):
259
+ path = Path(path)
260
+
261
+ path.mkdir(exist_ok=True, parents=True)
262
+
263
+ appendix = f"{split}" if not long_tail_only else f"{split}_lt"
264
+
265
+ real_base = path / f'real_{appendix}'
266
+ fake_base = path / model_tag / f'fake_{appendix}'
267
+
268
+ if real_base.exists() and not fake_only:
269
+ shutil.rmtree(real_base)
270
+
271
+ if fake_base.exists():
272
+ shutil.rmtree(fake_base)
273
+
274
+ real_base.mkdir(exist_ok=True)
275
+ fake_base.mkdir(exist_ok=True, parents=True)
276
+
277
+ print('Saving images...')
278
+
279
+ print(' Saving images to {}'.format(str(real_base)))
280
+ print(' Saving images to {}'.format(str(fake_base)))
281
+
282
+ long_tail_chars = get_long_tail_chars()
283
+ counter = 0
284
+ ann = defaultdict(lambda: {})
285
+ start_time = time.time()
286
+ for step, data in enumerate(loader):
287
+ style_images = data['simg'].to(self.model.args.device)
288
+
289
+ texts = [l.decode('utf-8') for l in data['label']]
290
+ texts = [t.encode('utf-8') for t in texts]
291
+ eval_text_encode, eval_len_text, _ = self.model.netconverter.encode(texts)
292
+ eval_text_encode = eval_text_encode.to(self.model.args.device).unsqueeze(1)
293
+
294
+ vis_style = np.vstack(style_images[0].detach().cpu().numpy())
295
+ vis_style = ((vis_style + 1) / 2) * 255
296
+
297
+ fakes = self.model.netG.evaluate(style_images, eval_text_encode)
298
+ fake_images = torch.cat(fakes, 1).detach().cpu().numpy()
299
+ real_images = data['img'].detach().cpu().numpy()
300
+ writer_ids = data['wcl'].int().tolist()
301
+
302
+ for i, (fake, real, wid, lb, img_id) in enumerate(zip(fake_images, real_images, writer_ids, data['label'], data['idx'])):
303
+ lb = lb.decode()
304
+ ann[f"{wid:03d}"][f'{img_id:05d}'] = lb
305
+ img_id = f'{img_id:05d}.png'
306
+
307
+ is_long_tail = any(c in long_tail_chars for c in lb)
308
+
309
+ if long_tail_only and not is_long_tail:
310
+ continue
311
+
312
+ fake_img_path = fake_base / f"{wid:03d}" / img_id
313
+ fake_img_path.parent.mkdir(exist_ok=True, parents=True)
314
+ cv2.imwrite(str(fake_img_path), 255 * ((fake.squeeze() + 1) / 2))
315
+
316
+ if not fake_only:
317
+ real_img_path = real_base / f"{wid:03d}" / img_id
318
+ real_img_path.parent.mkdir(exist_ok=True, parents=True)
319
+ cv2.imwrite(str(real_img_path), 255 * ((real.squeeze() + 1) / 2))
320
+
321
+ counter += 1
322
+
323
+ eta = (time.time() - start_time) / (step + 1) * (len(loader) - step - 1)
324
+ eta = str(timedelta(seconds=eta))
325
+ if step % 100 == 0:
326
+ print(f'[{(step + 1) / len(loader) * 100:.02f}%][{counter:05d}] ETA {eta}')
327
+
328
+ with open(path / 'ann.json', 'w') as f:
329
+ json.dump(ann, f)
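`generate_fid` also dumps an `ann.json` mapping zero-padded writer ids to `{image id: transcription}`. A sketch for reading it back (the folder is an example output location):

```python
# Inspect the annotation file written at the end of Writer.generate_fid.
import json
from pathlib import Path

fid_root = Path("saved_images/fid/IAM32")  # example output folder
with open(fid_root / "ann.json") as f:
    ann = json.load(f)

n_images = sum(len(images) for images in ann.values())
print(f"{len(ann)} writers, {n_images} annotated images")
```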
generation_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.46.2"
4
+ }
hwt/config.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "add_noise": false,
3
+ "alphabet": "Only thewigsofrcvdampbkuq.A-210xT5'MDL,RYHJ\"ISPWENj&BC93VGFKz();#:!7U64Q8?+*ZX/%",
4
+ "architectures": [
5
+ "VATrPP"
6
+ ],
7
+ "augment_ocr": false,
8
+ "batch_size": 8,
9
+ "corpus": "standard",
10
+ "d_crop_size": null,
11
+ "d_lr": 1e-05,
12
+ "dataset": "IAM",
13
+ "device": "cuda",
14
+ "english_words_path": "files/english_words.txt",
15
+ "epochs": 100000,
16
+ "feat_model_path": "files/resnet_18_pretrained.pth",
17
+ "file_suffix": null,
18
+ "g_lr": 5e-05,
19
+ "img_height": 32,
20
+ "is_cycle": false,
21
+ "label_encoder": "default",
22
+ "model_type": "emuru",
23
+ "no_ocr_loss": false,
24
+ "no_writer_loss": false,
25
+ "num_examples": 15,
26
+ "num_words": 3,
27
+ "num_workers": 0,
28
+ "num_writers": 339,
29
+ "ocr_lr": 5e-05,
30
+ "query_input": "unifont",
31
+ "resolution": 16,
32
+ "save_model": 5,
33
+ "save_model_history": 500,
34
+ "save_model_path": "saved_models",
35
+ "seed": 742,
36
+ "special_alphabet": "\u0391\u03b1\u0392\u03b2\u0393\u03b3\u0394\u03b4\u0395\u03b5\u0396\u03b6\u0397\u03b7\u0398\u03b8\u0399\u03b9\u039a\u03ba\u039b\u03bb\u039c\u03bc\u039d\u03bd\u039e\u03be\u039f\u03bf\u03a0\u03c0\u03a1\u03c1\u03a3\u03c3\u03c2\u03a4\u03c4\u03a5\u03c5\u03a6\u03c6\u03a7\u03c7\u03a8\u03c8\u03a9\u03c9",
37
+ "tag": "debug",
38
+ "text_aug_type": "proportional",
39
+ "text_augment_strength": 0.0,
40
+ "torch_dtype": "float32",
41
+ "transformers_version": "4.46.2",
42
+ "vocab_size": 80,
43
+ "w_lr": 5e-05,
44
+ "wandb": false,
45
+ "writer_loss_weight": 1.0
46
+ }
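A small sketch for loading `hwt/config.json` and inspecting the fields the model code relies on; `vocab_size` is expected to correspond to the length of `alphabet`.

```python
# Load the exported configuration and print a few key fields.
import json

with open("hwt/config.json") as f:
    cfg = json.load(f)

print(cfg["dataset"], cfg["num_writers"], cfg["img_height"])
print(cfg["vocab_size"], len(cfg["alphabet"]))  # expected to match
```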
hwt/generation_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.46.2"
4
+ }
hwt/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c9bd990cdfd3a2a1683af05705c1f9a17b7f58b580a33853b0d0af7c57f7f2e
3
+ size 560965208
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b1e4b7cae23652acd5c559117d06ef42fdd5317da2a5e0bc94ea44d8c0eb1ff
3
+ size 560965208
modeling_vatrpp.py ADDED
@@ -0,0 +1,338 @@
1
+ from transformers import PreTrainedModel
2
+ from .configuration_vatrpp import VATrPPConfig
3
+ import json
4
+ import os
5
+ import random
6
+ import shutil
7
+ from collections import defaultdict
8
+ import time
9
+ from datetime import timedelta
10
+ from pathlib import Path
11
+
12
+ import cv2
13
+ import numpy as np
14
+ import torch
15
+
16
+ from data.dataset import FolderDataset
17
+ from models.model import VATr
18
+ from util.loading import load_checkpoint, load_generator
19
+ from util.misc import FakeArgs
20
+ from util.text import TextGenerator
21
+ from util.vision import detect_text_bounds
22
+ from torchvision.transforms.functional import to_pil_image
23
+
24
+
25
+ def get_long_tail_chars():
26
+ with open(f"files/longtail.txt", 'r') as f:
27
+ chars = [c.rstrip() for c in f]
28
+
29
+ chars.remove('')
30
+
31
+ return chars
32
+
33
+ class VATrPP(PreTrainedModel):
34
+ config_class = VATrPPConfig
35
+
36
+ def __init__(self, config: VATrPPConfig) -> None:
37
+ super().__init__(config)
38
+ self.model = VATr(config)
39
+ self.model.eval()
40
+
41
+ def set_style_folder(self, style_folder, num_examples=15):
42
+ word_lengths = None
43
+ if os.path.exists(os.path.join(style_folder, "word_lengths.txt")):
44
+ word_lengths = {}
45
+ with open(os.path.join(style_folder, "word_lengths.txt"), 'r') as f:
46
+ for line in f:
47
+ word, length = line.rstrip().split(",")
48
+ word_lengths[word] = int(length)
49
+
50
+ self.style_dataset = FolderDataset(style_folder, num_examples=num_examples, word_lengths=word_lengths)
51
+
52
+ @torch.no_grad()
53
+ def generate(self, gen_text, style_imgs, align_words: bool = False, at_once: bool = False):
54
+ style_images = style_imgs.unsqueeze(0).to(self.model.args.device)
55
+
56
+ fake = self.create_fake_sentence(style_images, gen_text, align_words, at_once)
57
+ return to_pil_image(fake)
58
+
59
+ # @torch.no_grad()
60
+ # def generate(self, texts, align_words: bool = False, at_once: bool = False):
61
+ # if isinstance(texts, str):
62
+ # texts = [texts]
63
+ # if self.style_dataset is None:
64
+ # raise Exception('Style is not set')
65
+
66
+ # fakes = []
67
+ # for i, text in enumerate(texts, 1):
68
+ # print(f'[{i}/{len(texts)}] Generating for text: {text}')
69
+ # style = self.style_dataset.sample_style()
70
+ # style_images = style['simg'].unsqueeze(0).to(self.model.args.device)
71
+
72
+ # fake = self.create_fake_sentence(style_images, text, align_words, at_once)
73
+
74
+ # fakes.append(fake)
75
+ # return fakes
76
+
77
+ @torch.no_grad()
78
+ def create_fake_sentence(self, style_images, text, align_words=False, at_once=False):
79
+ text = "".join([c for c in text if c in self.model.args.alphabet])
80
+
81
+ text = text.split() if not at_once else [text]
82
+ gap = np.ones((32, 16))
83
+
84
+ text_encode, len_text, encode_pos = self.model.netconverter.encode(text)
85
+ text_encode = text_encode.to(self.model.args.device).unsqueeze(0)
86
+
87
+ fake = self.model._generate_fakes(style_images, text_encode, len_text)
88
+ if not at_once:
89
+ if align_words:
90
+ fake = self.stitch_words(fake, show_lines=False)
91
+ else:
92
+ fake = np.concatenate(sum([[img, gap] for img in fake], []), axis=1)[:, :-16]
93
+ else:
94
+ fake = fake[0]
95
+ fake = (fake * 255).astype(np.uint8)
96
+
97
+ return fake
98
+
99
+ @torch.no_grad()
100
+ def generate_authors(self, text, dataset, align_words: bool = False, at_once: bool = False):
101
+ fakes = []
102
+ author_ids = []
103
+ style = []
104
+
105
+ for item in dataset:
106
+ print(f"Generating author {item['wcl']}")
107
+ style_images = item['simg'].to(self.model.args.device).unsqueeze(0)
108
+
109
+ generated_lines = [self.create_fake_sentence(style_images, line, align_words, at_once) for line in text]
110
+
111
+ fakes.append(generated_lines)
112
+ author_ids.append(item['author_id'])
113
+ style.append((((item['simg'].numpy() + 1.0) / 2.0) * 255).astype(np.uint8))
114
+
115
+ return fakes, author_ids, style
116
+
117
+ @torch.no_grad()
118
+ def generate_characters(self, dataset, characters: str):
119
+ """
120
+ Generate each of the given characters for each of the authors in the dataset.
121
+ """
122
+ fakes = []
123
+
124
+ text_encode, len_text, encode_pos = self.model.netconverter.encode([c for c in characters])
125
+ text_encode = text_encode.to(self.model.args.device).unsqueeze(0)
126
+
127
+ for item in dataset:
128
+ print(f"Generating author {item['wcl']}")
129
+ style_images = item['simg'].to(self.model.args.device).unsqueeze(0)
130
+ fake = self.model.netG.evaluate(style_images, text_encode)
131
+
132
+ fakes.append(fake)
133
+
134
+ return fakes
135
+
136
+ @torch.no_grad()
137
+ def generate_batch(self, style_imgs, text):
138
+ """
139
+ Given a batch of style images and text, generate images using the model
140
+ """
141
+ device = self.model.args.device
142
+ text_encode, _, _ = self.model.netconverter.encode(text)
143
+ fakes, _ = self.model.netG(style_imgs.to(device), text_encode.to(device))
144
+ return fakes
145
+
146
+ @torch.no_grad()
147
+ def generate_ocr(self, dataset, number: int, output_folder: str = 'saved_images/ocr', interpolate_style: bool = False, text_generator: TextGenerator = None, long_tail: bool = False):
148
+ def create_and_write(style, text, interpolated=False):
149
+ nonlocal image_counter, annotations
150
+
151
+ text_encode, len_text, encode_pos = self.model.netconverter.encode([text])
152
+ text_encode = text_encode.to(self.model.args.device)
153
+
154
+ fake = self.model.netG.generate(style, text_encode)
155
+
156
+ fake = (fake + 1) / 2
157
+ fake = fake.cpu().numpy()
158
+ fake = np.squeeze((fake * 255).astype(np.uint8))
159
+
160
+ image_filename = f"{image_counter}.png" if not interpolated else f"{image_counter}_i.png"
161
+
162
+ cv2.imwrite(os.path.join(output_folder, "generated", image_filename), fake)
163
+
164
+ annotations.append((image_filename, text))
165
+
166
+ image_counter += 1
167
+
168
+ image_counter = 0
169
+ annotations = []
170
+ previous_style = None
171
+ long_tail_chars = get_long_tail_chars()
172
+
173
+ os.mkdir(os.path.join(output_folder, "generated"))
174
+ if text_generator is None:
175
+ os.mkdir(os.path.join(output_folder, "reference"))
176
+
177
+ while image_counter < number:
178
+ author_index = random.randint(0, len(dataset) - 1)
179
+ item = dataset[author_index]
180
+
181
+ style_images = item['simg'].to(self.model.args.device).unsqueeze(0)
182
+ style = self.model.netG.compute_style(style_images)
183
+
184
+ if interpolate_style and previous_style is not None:
185
+ factor = float(np.clip(random.gauss(0.5, 0.15), 0.0, 1.0))
186
+ intermediate_style = torch.lerp(previous_style, style, factor)
187
+ text = text_generator.generate()
188
+
189
+ create_and_write(intermediate_style, text, interpolated=True)
190
+
191
+ if text_generator is not None:
192
+ text = text_generator.generate()
193
+ else:
194
+ text = str(item['label'].decode())
195
+
196
+ if long_tail and not any(c in long_tail_chars for c in text):
197
+ continue
198
+
199
+ fake = (item['img'] + 1) / 2
200
+ fake = fake.cpu().numpy()
201
+ fake = np.squeeze((fake * 255).astype(np.uint8))
202
+
203
+ image_filename = f"{image_counter}.png"
204
+
205
+ cv2.imwrite(os.path.join(output_folder, "reference", image_filename), fake)
206
+
207
+ create_and_write(style, text)
208
+
209
+ previous_style = style
210
+
211
+ if text_generator is None:
212
+ with open(os.path.join(output_folder, "reference", "labels.csv"), 'w') as fr:
213
+ fr.write(f"filename,words\n")
214
+ for annotation in annotations:
215
+ fr.write(f"{annotation[0]},{annotation[1]}\n")
216
+
217
+ with open(os.path.join(output_folder, "generated", "labels.csv"), 'w') as fg:
218
+ fg.write(f"filename,words\n")
219
+ for annotation in annotations:
220
+ fg.write(f"{annotation[0]},{annotation[1]}\n")
221
+
222
+
223
+ @staticmethod
224
+ def stitch_words(words: list, show_lines: bool = False, scale_words: bool = False):
225
+ gap_width = 16
226
+
227
+ bottom_lines = []
228
+ top_lines = []
229
+ for i in range(len(words)):
230
+ b, t = detect_text_bounds(words[i])
231
+ bottom_lines.append(b)
232
+ top_lines.append(t)
233
+ if show_lines:
234
+ words[i] = cv2.line(words[i], (0, b), (words[i].shape[1], b), (0, 0, 1.0))
235
+ words[i] = cv2.line(words[i], (0, t), (words[i].shape[1], t), (1.0, 0, 0))
236
+
237
+ bottom_lines = np.array(bottom_lines, dtype=float)
238
+
239
+ if scale_words:
240
+ top_lines = np.array(top_lines, dtype=float)
241
+ gaps = bottom_lines - top_lines
242
+ target_gap = np.mean(gaps)
243
+ scales = target_gap / gaps
244
+
245
+ bottom_lines *= scales
246
+ top_lines *= scales
247
+ words = [cv2.resize(word, None, fx=scale, fy=scale) for word, scale in zip(words, scales)]
248
+
249
+ highest = np.max(bottom_lines)
250
+ offsets = highest - bottom_lines
251
+ height = np.max(offsets + [word.shape[0] for word in words])
252
+
253
+ result = np.ones((int(height), gap_width * len(words) + sum([w.shape[1] for w in words])))
254
+
255
+ x_pos = 0
256
+ for bottom_line, word in zip(bottom_lines, words):
257
+ offset = int(highest - bottom_line)
258
+
259
+ result[offset:offset + word.shape[0], x_pos:x_pos+word.shape[1]] = word
260
+
261
+ x_pos += word.shape[1] + gap_width
262
+
263
+ return result
264
+
265
+ @torch.no_grad()
266
+ def generate_fid(self, path, loader, model_tag, split='train', fake_only=False, long_tail_only=False):
267
+ if not isinstance(path, Path):
268
+ path = Path(path)
269
+
270
+ path.mkdir(exist_ok=True, parents=True)
271
+
272
+ appendix = f"{split}" if not long_tail_only else f"{split}_lt"
273
+
274
+ real_base = path / f'real_{appendix}'
275
+ fake_base = path / model_tag / f'fake_{appendix}'
276
+
277
+ if real_base.exists() and not fake_only:
278
+ shutil.rmtree(real_base)
279
+
280
+ if fake_base.exists():
281
+ shutil.rmtree(fake_base)
282
+
283
+ real_base.mkdir(exist_ok=True)
284
+ fake_base.mkdir(exist_ok=True, parents=True)
285
+
286
+ print('Saving images...')
287
+
288
+ print(' Saving images to {}'.format(str(real_base)))
289
+ print(' Saving images to {}'.format(str(fake_base)))
290
+
291
+ long_tail_chars = get_long_tail_chars()
292
+ counter = 0
293
+ ann = defaultdict(lambda: {})
294
+ start_time = time.time()
295
+ for step, data in enumerate(loader):
296
+ style_images = data['simg'].to(self.model.args.device)
297
+
298
+ texts = [l.decode('utf-8') for l in data['label']]
299
+ texts = [t.encode('utf-8') for t in texts]
300
+ eval_text_encode, eval_len_text, _ = self.model.netconverter.encode(texts)
301
+ eval_text_encode = eval_text_encode.to(self.model.args.device).unsqueeze(1)
302
+
303
+ vis_style = np.vstack(style_images[0].detach().cpu().numpy())
304
+ vis_style = ((vis_style + 1) / 2) * 255
305
+
306
+ fakes = self.model.netG.evaluate(style_images, eval_text_encode)
307
+ fake_images = torch.cat(fakes, 1).detach().cpu().numpy()
308
+ real_images = data['img'].detach().cpu().numpy()
309
+ writer_ids = data['wcl'].int().tolist()
310
+
311
+ for i, (fake, real, wid, lb, img_id) in enumerate(zip(fake_images, real_images, writer_ids, data['label'], data['idx'])):
312
+ lb = lb.decode()
313
+ ann[f"{wid:03d}"][f'{img_id:05d}'] = lb
314
+ img_id = f'{img_id:05d}.png'
315
+
316
+ is_long_tail = any(c in long_tail_chars for c in lb)
317
+
318
+ if long_tail_only and not is_long_tail:
319
+ continue
320
+
321
+ fake_img_path = fake_base / f"{wid:03d}" / img_id
322
+ fake_img_path.parent.mkdir(exist_ok=True, parents=True)
323
+ cv2.imwrite(str(fake_img_path), 255 * ((fake.squeeze() + 1) / 2))
324
+
325
+ if not fake_only:
326
+ real_img_path = real_base / f"{wid:03d}" / img_id
327
+ real_img_path.parent.mkdir(exist_ok=True, parents=True)
328
+ cv2.imwrite(str(real_img_path), 255 * ((real.squeeze() + 1) / 2))
329
+
330
+ counter += 1
331
+
332
+ eta = (time.time() - start_time) / (step + 1) * (len(loader) - step - 1)
333
+ eta = str(timedelta(seconds=eta))
334
+ if step % 100 == 0:
335
+ print(f'[{(step + 1) / len(loader) * 100:.02f}%][{counter:05d}] ETA {eta}')
336
+
337
+ with open(path / 'ann.json', 'w') as f:
338
+ json.dump(ann, f)
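Because `VATrPP` subclasses `PreTrainedModel` with a custom config class, a published checkpoint with this layout (config.json plus model.safetensors and the `modeling_vatrpp.py`/`configuration_vatrpp.py` modules) could be loaded through the `transformers` remote-code mechanism. The snippet below is only a sketch: the repo id is a placeholder, and it assumes the Hub repo registers the class via `auto_map` so that `AutoModel` can resolve it.

```python
# Hedged loading sketch; "vittoriopippi/vatrpp" is a placeholder repo id.
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "vittoriopippi/vatrpp",   # hypothetical Hub repo containing this code and weights
    trust_remote_code=True,   # required because VATrPP is defined in modeling_vatrpp.py
)
model.eval()

# model.generate(text, style_imgs) then returns a PIL image of the synthesized
# sentence, where style_imgs is a tensor of preprocessed writer samples.
```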
models/BigGAN_layers.py ADDED
@@ -0,0 +1,469 @@
1
+ ''' Layers
2
+ This file contains various layers for the BigGAN models.
3
+ '''
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn import init
8
+ import torch.optim as optim
9
+ import torch.nn.functional as F
10
+ from torch.nn import Parameter as P
11
+
12
+ from .sync_batchnorm import SynchronizedBatchNorm2d as SyncBN2d
13
+
14
+ # Projection of x onto y
15
+ def proj(x, y):
16
+ return torch.mm(y, x.t()) * y / torch.mm(y, y.t())
17
+
18
+
19
+ # Orthogonalize x wrt list of vectors ys
20
+ def gram_schmidt(x, ys):
21
+ for y in ys:
22
+ x = x - proj(x, y)
23
+ return x
24
+
25
+
26
+ # Apply num_itrs steps of the power method to estimate top N singular values.
27
+ def power_iteration(W, u_, update=True, eps=1e-12):
28
+ # Lists holding singular vectors and values
29
+ us, vs, svs = [], [], []
30
+ for i, u in enumerate(u_):
31
+ # Run one step of the power iteration
32
+ with torch.no_grad():
33
+ v = torch.matmul(u, W)
34
+ # Run Gram-Schmidt to subtract components of all other singular vectors
35
+ v = F.normalize(gram_schmidt(v, vs), eps=eps)
36
+ # Add to the list
37
+ vs += [v]
38
+ # Update the other singular vector
39
+ u = torch.matmul(v, W.t())
40
+ # Run Gram-Schmidt to subtract components of all other singular vectors
41
+ u = F.normalize(gram_schmidt(u, us), eps=eps)
42
+ # Add to the list
43
+ us += [u]
44
+ if update:
45
+ u_[i][:] = u
46
+ # Compute this singular value and add it to the list
47
+ svs += [torch.squeeze(torch.matmul(torch.matmul(v, W.t()), u.t()))]
48
+ # svs += [torch.sum(F.linear(u, W.transpose(0, 1)) * v)]
49
+ return svs, us, vs
50
+
51
+
52
+ # Convenience passthrough function
53
+ class identity(nn.Module):
54
+ def forward(self, input):
55
+ return input
56
+
57
+
58
+ # Spectral normalization base class
59
+ class SN(object):
60
+ def __init__(self, num_svs, num_itrs, num_outputs, transpose=False, eps=1e-12):
61
+ # Number of power iterations per step
62
+ self.num_itrs = num_itrs
63
+ # Number of singular values
64
+ self.num_svs = num_svs
65
+ # Transposed?
66
+ self.transpose = transpose
67
+ # Epsilon value for avoiding divide-by-0
68
+ self.eps = eps
69
+ # Register a singular vector for each sv
70
+ for i in range(self.num_svs):
71
+ self.register_buffer('u%d' % i, torch.randn(1, num_outputs))
72
+ self.register_buffer('sv%d' % i, torch.ones(1))
73
+
74
+ # Singular vectors (u side)
75
+ @property
76
+ def u(self):
77
+ return [getattr(self, 'u%d' % i) for i in range(self.num_svs)]
78
+
79
+ # Singular values;
80
+ # note that these buffers are just for logging and are not used in training.
81
+ @property
82
+ def sv(self):
83
+ return [getattr(self, 'sv%d' % i) for i in range(self.num_svs)]
84
+
85
+ # Compute the spectrally-normalized weight
86
+ def W_(self):
87
+ W_mat = self.weight.view(self.weight.size(0), -1)
88
+ if self.transpose:
89
+ W_mat = W_mat.t()
90
+ # Apply num_itrs power iterations
91
+ for _ in range(self.num_itrs):
92
+ svs, us, vs = power_iteration(W_mat, self.u, update=self.training, eps=self.eps)
93
+ # Update the svs
94
+ if self.training:
95
+ with torch.no_grad(): # Make sure to do this in a no_grad() context or you'll get memory leaks!
96
+ for i, sv in enumerate(svs):
97
+ self.sv[i][:] = sv
98
+ return self.weight / svs[0]
99
+
100
+
101
+ # 2D Conv layer with spectral norm
102
+ class SNConv2d(nn.Conv2d, SN):
103
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
104
+ padding=0, dilation=1, groups=1, bias=True,
105
+ num_svs=1, num_itrs=1, eps=1e-12):
106
+ nn.Conv2d.__init__(self, in_channels, out_channels, kernel_size, stride,
107
+ padding, dilation, groups, bias)
108
+ SN.__init__(self, num_svs, num_itrs, out_channels, eps=eps)
109
+
110
+ def forward(self, x):
111
+ return F.conv2d(x, self.W_(), self.bias, self.stride,
112
+ self.padding, self.dilation, self.groups)
113
+
114
+
115
+ # Linear layer with spectral norm
116
+ class SNLinear(nn.Linear, SN):
117
+ def __init__(self, in_features, out_features, bias=True,
118
+ num_svs=1, num_itrs=1, eps=1e-12):
119
+ nn.Linear.__init__(self, in_features, out_features, bias)
120
+ SN.__init__(self, num_svs, num_itrs, out_features, eps=eps)
121
+
122
+ def forward(self, x):
123
+ return F.linear(x, self.W_(), self.bias)
124
+
125
+
126
+ # Embedding layer with spectral norm
127
+ # We use num_embeddings as the dim instead of embedding_dim here
128
+ # for convenience sake
129
+ class SNEmbedding(nn.Embedding, SN):
130
+ def __init__(self, num_embeddings, embedding_dim, padding_idx=None,
131
+ max_norm=None, norm_type=2, scale_grad_by_freq=False,
132
+ sparse=False, _weight=None,
133
+ num_svs=1, num_itrs=1, eps=1e-12):
134
+ nn.Embedding.__init__(self, num_embeddings, embedding_dim, padding_idx,
135
+ max_norm, norm_type, scale_grad_by_freq,
136
+ sparse, _weight)
137
+ SN.__init__(self, num_svs, num_itrs, num_embeddings, eps=eps)
138
+
139
+ def forward(self, x):
140
+ return F.embedding(x, self.W_())
141
+
142
+
143
+ # A non-local block as used in SA-GAN
144
+ # Note that the implementation as described in the paper is largely incorrect;
145
+ # refer to the released code for the actual implementation.
146
+ class Attention(nn.Module):
147
+ def __init__(self, ch, which_conv=SNConv2d, name='attention'):
148
+ super(Attention, self).__init__()
149
+ # Channel multiplier
150
+ self.ch = ch
151
+ self.which_conv = which_conv
152
+ self.theta = self.which_conv(self.ch, self.ch // 8, kernel_size=1, padding=0, bias=False)
153
+ self.phi = self.which_conv(self.ch, self.ch // 8, kernel_size=1, padding=0, bias=False)
154
+ self.g = self.which_conv(self.ch, self.ch // 2, kernel_size=1, padding=0, bias=False)
155
+ self.o = self.which_conv(self.ch // 2, self.ch, kernel_size=1, padding=0, bias=False)
156
+ # Learnable gain parameter
157
+ self.gamma = P(torch.tensor(0.), requires_grad=True)
158
+
159
+ def forward(self, x, y=None):
160
+ # Apply convs
161
+ theta = self.theta(x)
162
+ phi = F.max_pool2d(self.phi(x), [2, 2])
163
+ g = F.max_pool2d(self.g(x), [2, 2])
164
+ # Perform reshapes
165
+ theta = theta.view(-1, self.ch // 8, x.shape[2] * x.shape[3])
166
+ try:
167
+ phi = phi.view(-1, self.ch // 8, x.shape[2] * x.shape[3] // 4)
168
+ except:
169
+ print(phi.shape)
170
+ g = g.view(-1, self.ch // 2, x.shape[2] * x.shape[3] // 4)
171
+ # Matmul and softmax to get attention maps
172
+ beta = F.softmax(torch.bmm(theta.transpose(1, 2), phi), -1)
173
+ # Attention map times g path
174
+ o = self.o(torch.bmm(g, beta.transpose(1, 2)).view(-1, self.ch // 2, x.shape[2], x.shape[3]))
175
+ return self.gamma * o + x
176
+
177
+
178
+ # Fused batchnorm op
179
+ def fused_bn(x, mean, var, gain=None, bias=None, eps=1e-5):
180
+ # Apply scale and shift--if gain and bias are provided, fuse them here
181
+ # Prepare scale
182
+ scale = torch.rsqrt(var + eps)
183
+ # If a gain is provided, use it
184
+ if gain is not None:
185
+ scale = scale * gain
186
+ # Prepare shift
187
+ shift = mean * scale
188
+ # If bias is provided, use it
189
+ if bias is not None:
190
+ shift = shift - bias
191
+ return x * scale - shift
192
+ # return ((x - mean) / ((var + eps) ** 0.5)) * gain + bias # The unfused way.
193
+
194
+
195
+ # Manual BN
196
+ # Calculate means and variances using mean-of-squares minus mean-squared
197
+ def manual_bn(x, gain=None, bias=None, return_mean_var=False, eps=1e-5):
198
+ # Cast x to float32 if necessary
199
+ float_x = x.float()
200
+ # Calculate expected value of x (m) and expected value of x**2 (m2)
201
+ # Mean of x
202
+ m = torch.mean(float_x, [0, 2, 3], keepdim=True)
203
+ # Mean of x squared
204
+ m2 = torch.mean(float_x ** 2, [0, 2, 3], keepdim=True)
205
+ # Calculate variance as mean of squared minus mean squared.
206
+ var = (m2 - m ** 2)
207
+ # Cast back to float 16 if necessary
208
+ var = var.type(x.type())
209
+ m = m.type(x.type())
210
+ # Return mean and variance for updating stored mean/var if requested
211
+ if return_mean_var:
212
+ return fused_bn(x, m, var, gain, bias, eps), m.squeeze(), var.squeeze()
213
+ else:
214
+ return fused_bn(x, m, var, gain, bias, eps)
215
+
216
+
217
+ # My batchnorm, supports standing stats
218
+ class myBN(nn.Module):
219
+ def __init__(self, num_channels, eps=1e-5, momentum=0.1):
220
+ super(myBN, self).__init__()
221
+ # momentum for updating running stats
222
+ self.momentum = momentum
223
+ # epsilon to avoid dividing by 0
224
+ self.eps = eps
225
+ # Momentum
226
+ self.momentum = momentum
227
+ # Register buffers
228
+ self.register_buffer('stored_mean', torch.zeros(num_channels))
229
+ self.register_buffer('stored_var', torch.ones(num_channels))
230
+ self.register_buffer('accumulation_counter', torch.zeros(1))
231
+ # Accumulate running means and vars
232
+ self.accumulate_standing = False
233
+
234
+ # reset standing stats
235
+ def reset_stats(self):
236
+ self.stored_mean[:] = 0
237
+ self.stored_var[:] = 0
238
+ self.accumulation_counter[:] = 0
239
+
240
+ def forward(self, x, gain, bias):
241
+ if self.training:
242
+ out, mean, var = manual_bn(x, gain, bias, return_mean_var=True, eps=self.eps)
243
+ # If accumulating standing stats, increment them
244
+ if self.accumulate_standing:
245
+ self.stored_mean[:] = self.stored_mean + mean.data
246
+ self.stored_var[:] = self.stored_var + var.data
247
+ self.accumulation_counter += 1.0
248
+ # If not accumulating standing stats, take running averages
249
+ else:
250
+ self.stored_mean[:] = self.stored_mean * (1 - self.momentum) + mean * self.momentum
251
+ self.stored_var[:] = self.stored_var * (1 - self.momentum) + var * self.momentum
252
+ return out
253
+ # If not in training mode, use the stored statistics
254
+ else:
255
+ mean = self.stored_mean.view(1, -1, 1, 1)
256
+ var = self.stored_var.view(1, -1, 1, 1)
257
+ # If using standing stats, divide them by the accumulation counter
258
+ if self.accumulate_standing:
259
+ mean = mean / self.accumulation_counter
260
+ var = var / self.accumulation_counter
261
+ return fused_bn(x, mean, var, gain, bias, self.eps)
262
+
263
+
264
+ # Simple function to handle groupnorm norm stylization
265
+ def groupnorm(x, norm_style):
266
+ # If number of channels specified in norm_style:
267
+ if 'ch' in norm_style:
268
+ ch = int(norm_style.split('_')[-1])
269
+ groups = max(int(x.shape[1]) // ch, 1)
270
+ # If number of groups specified in norm style
271
+ elif 'grp' in norm_style:
272
+ groups = int(norm_style.split('_')[-1])
273
+ # If neither, default to groups = 16
274
+ else:
275
+ groups = 16
276
+ return F.group_norm(x, groups)
277
+
278
+
279
+ # Class-conditional bn
280
+ # output size is the number of channels, input size is for the linear layers
281
+ # Andy's Note: this class feels messy but I'm not really sure how to clean it up
282
+ # Suggestions welcome! (By which I mean, refactor this and make a pull request
283
+ # if you want to make this more readable/usable).
284
+ class ccbn(nn.Module):
285
+ def __init__(self, output_size, input_size, which_linear, eps=1e-5, momentum=0.1,
286
+ cross_replica=False, mybn=False, norm_style='bn', ):
287
+ super(ccbn, self).__init__()
288
+ self.output_size, self.input_size = output_size, input_size
289
+ # Prepare gain and bias layers
290
+ self.gain = which_linear(input_size, output_size)
291
+ self.bias = which_linear(input_size, output_size)
292
+ # epsilon to avoid dividing by 0
293
+ self.eps = eps
294
+ # Momentum
295
+ self.momentum = momentum
296
+ # Use cross-replica batchnorm?
297
+ self.cross_replica = cross_replica
298
+ # Use my batchnorm?
299
+ self.mybn = mybn
300
+ # Norm style?
301
+ self.norm_style = norm_style
302
+
303
+ if self.cross_replica:
304
+ self.bn = SyncBN2d(output_size, eps=self.eps, momentum=self.momentum, affine=False)
305
+ elif self.mybn:
306
+ self.bn = myBN(output_size, self.eps, self.momentum)
307
+ elif self.norm_style in ['bn', 'in']:
308
+ self.register_buffer('stored_mean', torch.zeros(output_size))
309
+ self.register_buffer('stored_var', torch.ones(output_size))
310
+
311
+ def forward(self, x, y):
312
+ # Calculate class-conditional gains and biases
313
+ gain = (1 + self.gain(y)).view(y.size(0), -1, 1, 1)
314
+ bias = self.bias(y).view(y.size(0), -1, 1, 1)
315
+ # If using my batchnorm
316
+ if self.mybn or self.cross_replica:
317
+ return self.bn(x, gain=gain, bias=bias)
318
+ # else:
319
+ else:
320
+ if self.norm_style == 'bn':
321
+ out = F.batch_norm(x, self.stored_mean, self.stored_var, None, None,
322
+ self.training, 0.1, self.eps)
323
+ elif self.norm_style == 'in':
324
+ out = F.instance_norm(x, self.stored_mean, self.stored_var, None, None,
325
+ self.training, 0.1, self.eps)
326
+ elif self.norm_style == 'gn':
327
+ out = groupnorm(x, self.norm_style)  # fixed typo: the attribute is norm_style
328
+ elif self.norm_style == 'nonorm':
329
+ out = x
330
+ return out * gain + bias
331
+
332
+ def extra_repr(self):
333
+ s = 'out: {output_size}, in: {input_size},'
334
+ s += ' cross_replica={cross_replica}'
335
+ return s.format(**self.__dict__)
336
+
337
+
338
+ # Normal, non-class-conditional BN
339
+ class bn(nn.Module):
340
+ def __init__(self, output_size, eps=1e-5, momentum=0.1,
341
+ cross_replica=False, mybn=False):
342
+ super(bn, self).__init__()
343
+ self.output_size = output_size
344
+ # Prepare gain and bias layers
345
+ self.gain = P(torch.ones(output_size), requires_grad=True)
346
+ self.bias = P(torch.zeros(output_size), requires_grad=True)
347
+ # epsilon to avoid dividing by 0
348
+ self.eps = eps
349
+ # Momentum
350
+ self.momentum = momentum
351
+ # Use cross-replica batchnorm?
352
+ self.cross_replica = cross_replica
353
+ # Use my batchnorm?
354
+ self.mybn = mybn
355
+
356
+ if self.cross_replica:
357
+ self.bn = SyncBN2d(output_size, eps=self.eps, momentum=self.momentum, affine=False)
358
+ elif mybn:
359
+ self.bn = myBN(output_size, self.eps, self.momentum)
360
+ # Register buffers if neither of the above
361
+ else:
362
+ self.register_buffer('stored_mean', torch.zeros(output_size))
363
+ self.register_buffer('stored_var', torch.ones(output_size))
364
+
365
+ def forward(self, x, y=None):
366
+ if self.cross_replica or self.mybn:
367
+ gain = self.gain.view(1, -1, 1, 1)
368
+ bias = self.bias.view(1, -1, 1, 1)
369
+ return self.bn(x, gain=gain, bias=bias)
370
+ else:
371
+ return F.batch_norm(x, self.stored_mean, self.stored_var, self.gain,
372
+ self.bias, self.training, self.momentum, self.eps)
373
+
374
+
375
+ # Generator blocks
376
+ # Note that this class assumes the kernel size and padding (and any other
377
+ # settings) have been selected in the main generator module and passed in
378
+ # through the which_conv arg. Similar rules apply with which_bn (the input
379
+ # size [which is actually the number of channels of the conditional info] must
380
+ # be preselected)
381
+ class GBlock(nn.Module):
382
+ def __init__(self, in_channels, out_channels,
383
+ which_conv1=nn.Conv2d, which_conv2=nn.Conv2d, which_bn=bn, activation=None,
384
+ upsample=None):
385
+ super(GBlock, self).__init__()
386
+
387
+ self.in_channels, self.out_channels = in_channels, out_channels
388
+ self.which_conv1, self.which_conv2, self.which_bn = which_conv1, which_conv2, which_bn
389
+ self.activation = activation
390
+ self.upsample = upsample
391
+ # Conv layers
392
+ self.conv1 = self.which_conv1(self.in_channels, self.out_channels)
393
+ self.conv2 = self.which_conv2(self.out_channels, self.out_channels)
394
+ self.learnable_sc = in_channels != out_channels or upsample
395
+ if self.learnable_sc:
396
+ self.conv_sc = self.which_conv1(in_channels, out_channels,
397
+ kernel_size=1, padding=0)
398
+ # Batchnorm layers
399
+ self.bn1 = self.which_bn(in_channels)
400
+ self.bn2 = self.which_bn(out_channels)
401
+ # upsample layers
402
+ self.upsample = upsample
403
+
404
+ def forward(self, x, y):
405
+ h = self.activation(self.bn1(x, y))
406
+ # h = self.activation(x)
407
+ # h=x
408
+ if self.upsample:
409
+ h = self.upsample(h)
410
+ x = self.upsample(x)
411
+ h = self.conv1(h)
412
+ h = self.activation(self.bn2(h, y))
413
+ # h = self.activation(h)
414
+ h = self.conv2(h)
415
+ if self.learnable_sc:
416
+ x = self.conv_sc(x)
417
+ return h + x
418
+
419
+
420
+ # Residual block for the discriminator
421
+ class DBlock(nn.Module):
422
+ def __init__(self, in_channels, out_channels, which_conv=SNConv2d, wide=True,
423
+ preactivation=False, activation=None, downsample=None, ):
424
+ super(DBlock, self).__init__()
425
+ self.in_channels, self.out_channels = in_channels, out_channels
426
+ # If using wide D (as in SA-GAN and BigGAN), change the channel pattern
427
+ self.hidden_channels = self.out_channels if wide else self.in_channels
428
+ self.which_conv = which_conv
429
+ self.preactivation = preactivation
430
+ self.activation = activation
431
+ self.downsample = downsample
432
+
433
+ # Conv layers
434
+ self.conv1 = self.which_conv(self.in_channels, self.hidden_channels)
435
+ self.conv2 = self.which_conv(self.hidden_channels, self.out_channels)
436
+ self.learnable_sc = True if (in_channels != out_channels) or downsample else False
437
+ if self.learnable_sc:
438
+ self.conv_sc = self.which_conv(in_channels, out_channels,
439
+ kernel_size=1, padding=0)
440
+
441
+ def shortcut(self, x):
442
+ if self.preactivation:
443
+ if self.learnable_sc:
444
+ x = self.conv_sc(x)
445
+ if self.downsample:
446
+ x = self.downsample(x)
447
+ else:
448
+ if self.downsample:
449
+ x = self.downsample(x)
450
+ if self.learnable_sc:
451
+ x = self.conv_sc(x)
452
+ return x
453
+
454
+ def forward(self, x):
455
+ if self.preactivation:
456
+ # h = self.activation(x) # NOT TODAY SATAN
457
+ # Andy's note: This line *must* be an out-of-place ReLU or it
458
+ # will negatively affect the shortcut connection.
459
+ h = F.relu(x)
460
+ else:
461
+ h = x
462
+ h = self.conv1(h)
463
+ h = self.conv2(self.activation(h))
464
+ if self.downsample:
465
+ h = self.downsample(h)
466
+
467
+ return h + self.shortcut(x)
468
+
469
+ # dogball
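
For readers skimming the diff: the class-conditional batch norm above (`ccbn`) normalizes features without affine parameters and then applies a gain and bias that are linear functions of the conditioning vector. Below is a minimal, self-contained sketch of that idea; it is not part of the commit and uses illustrative shapes only.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyConditionalBN(nn.Module):
    """Minimal sketch of the ccbn idea: parameter-free BN followed by a
    per-channel gain and bias predicted from the conditioning vector y."""
    def __init__(self, num_channels, cond_dim, eps=1e-5):
        super().__init__()
        self.gain = nn.Linear(cond_dim, num_channels)  # predicts per-channel scale
        self.bias = nn.Linear(cond_dim, num_channels)  # predicts per-channel shift
        self.register_buffer('stored_mean', torch.zeros(num_channels))
        self.register_buffer('stored_var', torch.ones(num_channels))
        self.eps = eps

    def forward(self, x, y):
        gain = (1 + self.gain(y)).view(y.size(0), -1, 1, 1)
        bias = self.bias(y).view(y.size(0), -1, 1, 1)
        out = F.batch_norm(x, self.stored_mean, self.stored_var, None, None,
                           self.training, 0.1, self.eps)
        return out * gain + bias

x = torch.randn(4, 64, 8, 8)   # feature map
y = torch.randn(4, 128)        # conditioning vector (e.g. a class/style embedding)
print(TinyConditionalBN(64, 128)(x, y).shape)  # torch.Size([4, 64, 8, 8])
```
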
models/BigGAN_networks.py ADDED
@@ -0,0 +1,379 @@
1
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # SPDX-License-Identifier: MIT
3
+ import functools
4
+
5
+ import cv2
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import random
11
+
12
+ from util.augmentations import ProgressiveWordCrop, CycleWordCrop, StaticWordCrop, RandomWordCrop
13
+ from . import BigGAN_layers as layers
14
+ from .networks import init_weights
15
+ import torchvision
16
+ # Attention is passed in in the format '32_64' to mean applying an attention
17
+ # block at both resolution 32x32 and 64x64. Just '64' will apply at 64x64.
18
+
19
+ from models.blocks import Conv2dBlock, ResBlocks
20
+
21
+
22
+ # Discriminator architecture, same paradigm as G's above
23
+ def D_arch(ch=64, attention='64', input_nc=3, ksize='333333', dilation='111111'):
24
+ arch = {}
25
+ arch[256] = {'in_channels': [input_nc] + [ch * item for item in [1, 2, 4, 8, 8, 16]],
26
+ 'out_channels': [item * ch for item in [1, 2, 4, 8, 8, 16, 16]],
27
+ 'downsample': [True] * 6 + [False],
28
+ 'resolution': [128, 64, 32, 16, 8, 4, 4],
29
+ 'attention': {2 ** i: 2 ** i in [int(item) for item in attention.split('_')]
30
+ for i in range(2, 8)}}
31
+ arch[128] = {'in_channels': [input_nc] + [ch * item for item in [1, 2, 4, 8, 16]],
32
+ 'out_channels': [item * ch for item in [1, 2, 4, 8, 16, 16]],
33
+ 'downsample': [True] * 5 + [False],
34
+ 'resolution': [64, 32, 16, 8, 4, 4],
35
+ 'attention': {2 ** i: 2 ** i in [int(item) for item in attention.split('_')]
36
+ for i in range(2, 8)}}
37
+ arch[64] = {'in_channels': [input_nc] + [ch * item for item in [1, 2, 4, 8]],
38
+ 'out_channels': [item * ch for item in [1, 2, 4, 8, 16]],
39
+ 'downsample': [True] * 4 + [False],
40
+ 'resolution': [32, 16, 8, 4, 4],
41
+ 'attention': {2 ** i: 2 ** i in [int(item) for item in attention.split('_')]
42
+ for i in range(2, 7)}}
43
+ arch[63] = {'in_channels': [input_nc] + [ch * item for item in [1, 2, 4, 8]],
44
+ 'out_channels': [item * ch for item in [1, 2, 4, 8, 16]],
45
+ 'downsample': [True] * 4 + [False],
46
+ 'resolution': [32, 16, 8, 4, 4],
47
+ 'attention': {2 ** i: 2 ** i in [int(item) for item in attention.split('_')]
48
+ for i in range(2, 7)}}
49
+ arch[32] = {'in_channels': [input_nc] + [item * ch for item in [4, 4, 4]],
50
+ 'out_channels': [item * ch for item in [4, 4, 4, 4]],
51
+ 'downsample': [True, True, False, False],
52
+ 'resolution': [16, 16, 16, 16],
53
+ 'attention': {2 ** i: 2 ** i in [int(item) for item in attention.split('_')]
54
+ for i in range(2, 6)}}
55
+ arch[129] = {'in_channels': [input_nc] + [ch * item for item in [1, 2, 4, 8, 8, 16]],
56
+ 'out_channels': [item * ch for item in [1, 2, 4, 8, 8, 16, 16]],
57
+ 'downsample': [True] * 6 + [False],
58
+ 'resolution': [128, 64, 32, 16, 8, 4, 4],
59
+ 'attention': {2 ** i: 2 ** i in [int(item) for item in attention.split('_')]
60
+ for i in range(2, 8)}}
61
+ arch[33] = {'in_channels': [input_nc] + [ch * item for item in [1, 2, 4, 8, 16]],
62
+ 'out_channels': [item * ch for item in [1, 2, 4, 8, 16, 16]],
63
+ 'downsample': [True] * 5 + [False],
64
+ 'resolution': [64, 32, 16, 8, 4, 4],
65
+ 'attention': {2 ** i: 2 ** i in [int(item) for item in attention.split('_')]
66
+ for i in range(2, 10)}}
67
+ arch[31] = {'in_channels': [input_nc] + [ch * item for item in [1, 2, 4, 8, 16]],
68
+ 'out_channels': [item * ch for item in [1, 2, 4, 8, 16, 16]],
69
+ 'downsample': [True] * 5 + [False],
70
+ 'resolution': [64, 32, 16, 8, 4, 4],
71
+ 'attention': {2 ** i: 2 ** i in [int(item) for item in attention.split('_')]
72
+ for i in range(2, 10)}}
73
+ arch[16] = {'in_channels': [input_nc] + [ch * item for item in [1, 8, 16]],
74
+ 'out_channels': [item * ch for item in [1, 8, 16, 16]],
75
+ 'downsample': [True] * 3 + [False],
76
+ 'resolution': [16, 8, 4, 4],
77
+ 'attention': {2 ** i: 2 ** i in [int(item) for item in attention.split('_')]
78
+ for i in range(2, 5)}}
79
+
80
+ arch[17] = {'in_channels': [input_nc] + [ch * item for item in [1, 4]],
81
+ 'out_channels': [item * ch for item in [1, 4, 8]],
82
+ 'downsample': [True] * 3,
83
+ 'resolution': [16, 8, 4],
84
+ 'attention': {2 ** i: 2 ** i in [int(item) for item in attention.split('_')]
85
+ for i in range(2, 5)}}
86
+
87
+
88
+ arch[20] = {'in_channels': [input_nc] + [ch * item for item in [1, 8, 16]],
89
+ 'out_channels': [item * ch for item in [1, 8, 16, 16]],
90
+ 'downsample': [True] * 3 + [False],
91
+ 'resolution': [16, 8, 4, 4],
92
+ 'attention': {2 ** i: 2 ** i in [int(item) for item in attention.split('_')]
93
+ for i in range(2, 5)}}
94
+ return arch
95
+
96
+
97
+ class Discriminator(nn.Module):
98
+
99
+ def __init__(self, resolution, D_ch=64, D_wide=True, D_kernel_size=3, D_attn='64',
100
+ num_D_SVs=1, num_D_SV_itrs=1, D_activation=nn.ReLU(inplace=False),
101
+ SN_eps=1e-8, output_dim=1, D_mixed_precision=False, D_fp16=False,
102
+ D_init='N02', skip_init=False, D_param='SN', gpu_ids=[0],bn_linear='SN', input_nc=1, one_hot=False, crop_size: list = None, **kwargs):
103
+
104
+ super(Discriminator, self).__init__()
105
+ self.crop = crop_size is not None and len(crop_size) > 0
106
+
107
+ use_padding = False
108
+
109
+ if self.crop:
110
+ w_crop = StaticWordCrop(crop_size[0], use_padding=use_padding) if len(crop_size) == 1 else RandomWordCrop(crop_size[0], crop_size[1], use_padding=use_padding)
111
+
112
+ self.augmenter = w_crop
113
+
114
+ self.name = 'D'
115
+ # gpu_ids
116
+ self.gpu_ids = gpu_ids
117
+ # one_hot representation
118
+ self.one_hot = one_hot
119
+ # Width multiplier
120
+ self.ch = D_ch
121
+ # Use Wide D as in BigGAN and SA-GAN or skinny D as in SN-GAN?
122
+ self.D_wide = D_wide
123
+ # Resolution
124
+ self.resolution = resolution
125
+ # Kernel size
126
+ self.kernel_size = D_kernel_size
127
+ # Attention?
128
+ self.attention = D_attn
129
+ # Activation
130
+ self.activation = D_activation
131
+ # Initialization style
132
+ self.init = D_init
133
+ # Parameterization style
134
+ self.D_param = D_param
135
+ # Epsilon for Spectral Norm?
136
+ self.SN_eps = SN_eps
137
+ # Fp16?
138
+ self.fp16 = D_fp16
139
+ # Architecture
140
+ self.arch = D_arch(self.ch, self.attention, input_nc)[resolution]
141
+
142
+ # Which convs, batchnorms, and linear layers to use
143
+ # No option to turn off SN in D right now
144
+ if self.D_param == 'SN':
145
+ self.which_conv = functools.partial(layers.SNConv2d,
146
+ kernel_size=3, padding=1,
147
+ num_svs=num_D_SVs, num_itrs=num_D_SV_itrs,
148
+ eps=self.SN_eps)
149
+ self.which_linear = functools.partial(layers.SNLinear,
150
+ num_svs=num_D_SVs, num_itrs=num_D_SV_itrs,
151
+ eps=self.SN_eps)
152
+ self.which_embedding = functools.partial(layers.SNEmbedding,
153
+ num_svs=num_D_SVs, num_itrs=num_D_SV_itrs,
154
+ eps=self.SN_eps)
155
+ if bn_linear=='SN':
156
+ self.which_embedding = functools.partial(layers.SNLinear,
157
+ num_svs=num_D_SVs, num_itrs=num_D_SV_itrs,
158
+ eps=self.SN_eps)
159
+ else:
160
+ self.which_conv = functools.partial(nn.Conv2d, kernel_size=3, padding=1)
161
+ self.which_linear = nn.Linear
162
+ # We use a non-spectral-normed embedding here regardless;
163
+ # For some reason applying SN to G's embedding seems to randomly cripple G
164
+ self.which_embedding = nn.Embedding
165
+ if one_hot:
166
+ self.which_embedding = functools.partial(layers.SNLinear,
167
+ num_svs=num_D_SVs, num_itrs=num_D_SV_itrs,
168
+ eps=self.SN_eps)
169
+ # Prepare model
170
+ # self.blocks is a doubly-nested list of modules, the outer loop intended
171
+ # to be over blocks at a given resolution (resblocks and/or self-attention)
172
+ self.blocks = []
173
+ for index in range(len(self.arch['out_channels'])):
174
+ self.blocks += [[layers.DBlock(in_channels=self.arch['in_channels'][index],
175
+ out_channels=self.arch['out_channels'][index],
176
+ which_conv=self.which_conv,
177
+ wide=self.D_wide,
178
+ activation=self.activation,
179
+ preactivation=(index > 0),
180
+ downsample=(nn.AvgPool2d(2) if self.arch['downsample'][index] else None))]]
181
+ # If attention on this block, attach it to the end
182
+ if self.arch['attention'][self.arch['resolution'][index]]:
183
+ print('Adding attention layer in D at resolution %d' % self.arch['resolution'][index])
184
+ self.blocks[-1] += [layers.Attention(self.arch['out_channels'][index],
185
+ self.which_conv)]
186
+ # Turn self.blocks into a ModuleList so that it's all properly registered.
187
+ self.blocks = nn.ModuleList([nn.ModuleList(block) for block in self.blocks])
188
+ # Linear output layer. The output dimension is typically 1, but may be
189
+ # larger if we're e.g. turning this into a VAE with an inference output
190
+ self.dropout = torch.nn.Dropout(p=0.5)
191
+ self.linear = self.which_linear(self.arch['out_channels'][-1], output_dim)
192
+
193
+ # Initialize weights
194
+ if not skip_init:
195
+ self = init_weights(self, D_init)
196
+
197
+ def update_parameters(self, epoch: int):
198
+ if self.crop:
199
+ self.augmenter.update(epoch)
200
+
201
+ def forward(self, x, y=None, **kwargs):
202
+ # Stick x into h for cleaner for loops without flow control
203
+ if self.crop and random.uniform(0.0, 1.0) < 0.33:
204
+ x = self.augmenter(x)
205
+
206
+ #imgs = [np.squeeze((img.detach().cpu().numpy() + 1.0) / 2.0) for img in x]
207
+ #imgs = (np.vstack(imgs) * 255.0).astype(np.uint8)
208
+ #cv2.imwrite(f"saved_images/debug/{random.randint(0, 1000)}.jpg", imgs)
209
+
210
+ h = x
211
+ # Loop over blocks
212
+ for index, blocklist in enumerate(self.blocks):
213
+ for block in blocklist:
214
+ h = block(h)
215
+
216
+ # Apply global sum pooling as in SN-GAN
217
+ h = torch.sum(self.activation(h), [2, 3])
218
+ out = self.linear(h)
219
+
220
+ return out
221
+
222
+ def return_features(self, x, y=None):
223
+ # Stick x into h for cleaner for loops without flow control
224
+ h = x
225
+ block_output = []
226
+ # Loop over blocks
227
+ for index, blocklist in enumerate(self.blocks):
228
+ for block in blocklist:
229
+ h = block(h)
230
+ block_output.append(h)
231
+ # Apply global sum pooling as in SN-GAN
232
+ # h = torch.sum(self.activation(h), [2, 3])
233
+ return block_output
234
+
235
+
236
+ class WDiscriminator(nn.Module):
237
+
238
+ def __init__(self, resolution, n_classes, output_dim, D_ch=64, D_wide=True, D_kernel_size=3, D_attn='64',
239
+ num_D_SVs=1, num_D_SV_itrs=1, D_activation=nn.ReLU(inplace=False),
240
+ SN_eps=1e-8, D_mixed_precision=False, D_fp16=False,
241
+ D_init='N02', skip_init=False, D_param='SN', gpu_ids=[0],bn_linear='SN', input_nc=1, one_hot=False):
242
+ super(WDiscriminator, self).__init__()
243
+
244
+ self.name = 'D'
245
+ # gpu_ids
246
+ self.gpu_ids = gpu_ids
247
+ # one_hot representation
248
+ self.one_hot = one_hot
249
+ # Width multiplier
250
+ self.ch = D_ch
251
+ # Use Wide D as in BigGAN and SA-GAN or skinny D as in SN-GAN?
252
+ self.D_wide = D_wide
253
+ # Resolution
254
+ self.resolution = resolution
255
+ # Kernel size
256
+ self.kernel_size = D_kernel_size
257
+ # Attention?
258
+ self.attention = D_attn
259
+ # Number of classes
260
+ self.n_classes = n_classes
261
+ # Activation
262
+ self.activation = D_activation
263
+ # Initialization style
264
+ self.init = D_init
265
+ # Parameterization style
266
+ self.D_param = D_param
267
+ # Epsilon for Spectral Norm?
268
+ self.SN_eps = SN_eps
269
+ # Fp16?
270
+ self.fp16 = D_fp16
271
+ # Architecture
272
+ self.arch = D_arch(self.ch, self.attention, input_nc)[resolution]
273
+
274
+ # Which convs, batchnorms, and linear layers to use
275
+ # No option to turn off SN in D right now
276
+ if self.D_param == 'SN':
277
+ self.which_conv = functools.partial(layers.SNConv2d,
278
+ kernel_size=3, padding=1,
279
+ num_svs=num_D_SVs, num_itrs=num_D_SV_itrs,
280
+ eps=self.SN_eps)
281
+ self.which_linear = functools.partial(layers.SNLinear,
282
+ num_svs=num_D_SVs, num_itrs=num_D_SV_itrs,
283
+ eps=self.SN_eps)
284
+ self.which_embedding = functools.partial(layers.SNEmbedding,
285
+ num_svs=num_D_SVs, num_itrs=num_D_SV_itrs,
286
+ eps=self.SN_eps)
287
+ if bn_linear == 'SN':
288
+ self.which_embedding = functools.partial(layers.SNLinear,
289
+ num_svs=num_D_SVs, num_itrs=num_D_SV_itrs,
290
+ eps=self.SN_eps)
291
+ else:
292
+ self.which_conv = functools.partial(nn.Conv2d, kernel_size=3, padding=1)
293
+ self.which_linear = nn.Linear
294
+ # We use a non-spectral-normed embedding here regardless;
295
+ # For some reason applying SN to G's embedding seems to randomly cripple G
296
+ self.which_embedding = nn.Embedding
297
+ if one_hot:
298
+ self.which_embedding = functools.partial(layers.SNLinear,
299
+ num_svs=num_D_SVs, num_itrs=num_D_SV_itrs,
300
+ eps=self.SN_eps)
301
+ # Prepare model
302
+ # self.blocks is a doubly-nested list of modules, the outer loop intended
303
+ # to be over blocks at a given resolution (resblocks and/or self-attention)
304
+ self.blocks = []
305
+ for index in range(len(self.arch['out_channels'])):
306
+ self.blocks += [[layers.DBlock(in_channels=self.arch['in_channels'][index],
307
+ out_channels=self.arch['out_channels'][index],
308
+ which_conv=self.which_conv,
309
+ wide=self.D_wide,
310
+ activation=self.activation,
311
+ preactivation=(index > 0),
312
+ downsample=(nn.AvgPool2d(2) if self.arch['downsample'][index] else None))]]
313
+ # If attention on this block, attach it to the end
314
+ if self.arch['attention'][self.arch['resolution'][index]]:
315
+ print('Adding attention layer in D at resolution %d' % self.arch['resolution'][index])
316
+ self.blocks[-1] += [layers.Attention(self.arch['out_channels'][index],
317
+ self.which_conv)]
318
+ # Turn self.blocks into a ModuleList so that it's all properly registered.
319
+ self.blocks = nn.ModuleList([nn.ModuleList(block) for block in self.blocks])
320
+ # Linear output layer. The output dimension is typically 1, but may be
321
+ # larger if we're e.g. turning this into a VAE with an inference output
322
+ self.dropout = torch.nn.Dropout(p=0.5)
323
+ self.linear = self.which_linear(self.arch['out_channels'][-1], output_dim)
324
+ # Embedding for projection discrimination
325
+ self.embed = self.which_embedding(self.n_classes, self.arch['out_channels'][-1])
326
+ self.cross_entropy = nn.CrossEntropyLoss()
327
+ # Initialize weights
328
+ if not skip_init:
329
+ self = init_weights(self, D_init)
330
+
331
+ def update_parameters(self, epoch: int):
332
+ pass
333
+
334
+ def forward(self, x, y=None, **kwargs):
335
+ # Stick x into h for cleaner for loops without flow control
336
+ h = x
337
+ # Loop over blocks
338
+ for index, blocklist in enumerate(self.blocks):
339
+ for block in blocklist:
340
+ h = block(h)
341
+ # Apply global sum pooling as in SN-GAN
342
+ h = torch.sum(self.activation(h), [2, 3])
343
+
344
+ # Get initial class-unconditional output
345
+ out = self.linear(h)
346
+ # Writer classification: cross-entropy between the linear output and the writer label y
347
+ #if y is not None:
348
+ loss = self.cross_entropy(out, y.long())
349
+ return loss
350
+
351
+ def return_features(self, x, y=None):
352
+ # Stick x into h for cleaner for loops without flow control
353
+ h = x
354
+ block_output = []
355
+ # Loop over blocks
356
+ for index, blocklist in enumerate(self.blocks):
357
+ for block in blocklist:
358
+ h = block(h)
359
+ block_output.append(h)
360
+ # Apply global sum pooling as in SN-GAN
361
+ # h = torch.sum(self.activation(h), [2, 3])
362
+ return block_output
363
+
364
+
365
+ class Encoder(Discriminator):
366
+ def __init__(self, opt, output_dim, **kwargs):
367
+ super(Encoder, self).__init__(**vars(opt))
368
+ self.output_layer = nn.Sequential(self.activation,
369
+ nn.Conv2d(self.arch['out_channels'][-1], output_dim, kernel_size=(4,2), padding=0, stride=2))
370
+
371
+ def forward(self, x):
372
+ # Stick x into h for cleaner for loops without flow control
373
+ h = x
374
+ # Loop over blocks
375
+ for index, blocklist in enumerate(self.blocks):
376
+ for block in blocklist:
377
+ h = block(h)
378
+ out = self.output_layer(h)
379
+ return out
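
A hedged usage sketch for the architecture table above: `D_arch` returns, per input resolution, the channel and downsampling schedule that the `Discriminator` constructor iterates over. This snippet is not part of the commit and assumes the repository root is on `PYTHONPATH` with its dependencies (torch, torchvision, cv2, the `util` package) installed.

```python
from models.BigGAN_networks import D_arch

arch = D_arch(ch=64, attention='64', input_nc=1)[32]  # the 32-pixel variant
print(arch['in_channels'])   # [1, 256, 256, 256]
print(arch['out_channels'])  # [256, 256, 256, 256]
print(arch['downsample'])    # [True, True, False, False]
print(arch['resolution'])    # [16, 16, 16, 16]
```
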
models/OCR_network.py ADDED
@@ -0,0 +1,193 @@
1
+ import torch
2
+ from .networks import *
3
+
4
+
5
+ class BidirectionalLSTM(nn.Module):
6
+
7
+ def __init__(self, nIn, nHidden, nOut):
8
+ super(BidirectionalLSTM, self).__init__()
9
+
10
+ self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True)
11
+ self.embedding = nn.Linear(nHidden * 2, nOut)
12
+
13
+
14
+ def forward(self, input):
15
+ recurrent, _ = self.rnn(input)
16
+ T, b, h = recurrent.size()
17
+ t_rec = recurrent.view(T * b, h)
18
+
19
+ output = self.embedding(t_rec) # [T * b, nOut]
20
+ output = output.view(T, b, -1)
21
+
22
+ return output
23
+
24
+
25
+ class CRNN(nn.Module):
26
+
27
+ def __init__(self, args, leakyRelu=False):
28
+ super(CRNN, self).__init__()
29
+ self.args = args
30
+ self.name = 'OCR'
31
+ self.add_noise = False
32
+ self.noise_fac = torch.distributions.Normal(loc=torch.tensor([0.]), scale=torch.tensor([0.2]))
33
+ #assert opt.imgH % 16 == 0, 'imgH has to be a multiple of 16'
34
+
35
+ ks = [3, 3, 3, 3, 3, 3, 2]
36
+ ps = [1, 1, 1, 1, 1, 1, 0]
37
+ ss = [1, 1, 1, 1, 1, 1, 1]
38
+ nm = [64, 128, 256, 256, 512, 512, 512]
39
+
40
+ cnn = nn.Sequential()
41
+ nh = 256
42
+ dealwith_lossnone = False  # whether to replace all nan/inf in gradients to zero
43
+
44
+ def convRelu(i, batchNormalization=False):
45
+ nIn = 1 if i == 0 else nm[i - 1]
46
+ nOut = nm[i]
47
+ cnn.add_module('conv{0}'.format(i),
48
+ nn.Conv2d(nIn, nOut, ks[i], ss[i], ps[i]))
49
+ if batchNormalization:
50
+ cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(nOut))
51
+ if leakyRelu:
52
+ cnn.add_module('relu{0}'.format(i),
53
+ nn.LeakyReLU(0.2, inplace=True))
54
+ else:
55
+ cnn.add_module('relu{0}'.format(i), nn.ReLU(True))
56
+
57
+ convRelu(0)
58
+ cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d(2, 2)) # 64x16x64
59
+ convRelu(1)
60
+ cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d(2, 2)) # 128x8x32
61
+ convRelu(2, True)
62
+ convRelu(3)
63
+ cnn.add_module('pooling{0}'.format(2),
64
+ nn.MaxPool2d((2, 2), (2, 1), (0, 1))) # 256x4x16
65
+ convRelu(4, True)
66
+ if self.args.resolution==63:
67
+ cnn.add_module('pooling{0}'.format(3),
68
+ nn.MaxPool2d((2, 2), (2, 1), (0, 1))) # 256x4x16
69
+ convRelu(5)
70
+ cnn.add_module('pooling{0}'.format(4),
71
+ nn.MaxPool2d((2, 2), (2, 1), (0, 1))) # 512x2x16
72
+ convRelu(6, True) # 512x1x16
73
+
74
+ self.cnn = cnn
75
+ self.use_rnn = False
76
+ if self.use_rnn:
77
+ self.rnn = nn.Sequential(
78
+ BidirectionalLSTM(512, nh, nh),
79
+ BidirectionalLSTM(nh, nh, self.args.vocab_size))
80
+ else:
81
+ self.linear = nn.Linear(512, self.args.vocab_size)
82
+
83
+ # replace all nan/inf in gradients to zero
84
+ if dealwith_lossnone:
85
+ self.register_backward_hook(self.backward_hook)
86
+
87
+ self.device = torch.device('cuda:{}'.format(0))
88
+ self.init = 'N02'
89
+ # Initialize weights
90
+
91
+ self = init_weights(self, self.init)
92
+
93
+ def forward(self, input):
94
+ # conv features
95
+ if self.add_noise:
96
+ input = input + self.noise_fac.sample(input.size()).squeeze(-1).to(self.args.device)
97
+ conv = self.cnn(input)
98
+ b, c, h, w = conv.size()
99
+ assert h == 1, "the height of conv must be 1"
102
+ conv = conv.squeeze(2)
103
+ conv = conv.permute(2, 0, 1) # [w, b, c]
104
+
105
+ if self.use_rnn:
106
+ # rnn features
107
+ output = self.rnn(conv)
108
+ else:
109
+ output = self.linear(conv)
110
+ return output
111
+
112
+ def backward_hook(self, module, grad_input, grad_output):
113
+ for g in grad_input:
114
+ g[g != g] = 0 # replace all nan/inf in gradients to zero
115
+
116
+
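
A hedged shape sketch for the `CRNN` recognizer above (not part of the commit; it assumes the repository is importable and uses a hypothetical `SimpleNamespace` in place of the real argument parser). With 32-pixel-high inputs the CNN collapses the height to 1 and shrinks the width by roughly a factor of 4, producing a `(T, B, vocab_size)` sequence for the CTC loss.

```python
import torch
from types import SimpleNamespace
from models.OCR_network import CRNN

# hypothetical settings standing in for the repository's argument parser
args = SimpleNamespace(resolution=32, vocab_size=80, device='cpu')
ocr = CRNN(args)

x = torch.randn(2, 1, 32, 128)   # (B, 1, H, W) grayscale word images
print(ocr(x).shape)              # torch.Size([33, 2, 80]) -> (T, B, vocab_size)
```
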
117
+ class strLabelConverter(object):
118
+ """Convert between str and label.
119
+ NOTE:
120
+ Insert `blank` to the alphabet for CTC.
121
+ Args:
122
+ alphabet (str): set of the possible characters.
123
+ ignore_case (bool, default=False): whether to ignore character case.
124
+ """
125
+
126
+ def __init__(self, alphabet, ignore_case=False):
127
+ self._ignore_case = ignore_case
128
+ if self._ignore_case:
129
+ alphabet = alphabet.lower()
130
+ self.alphabet = alphabet + '-' # for `-1` index
131
+
132
+ self.dict = {}
133
+ for i, char in enumerate(alphabet):
134
+ # NOTE: 0 is reserved for 'blank' required by wrap_ctc
135
+ self.dict[char] = i + 1
136
+
137
+ def encode(self, text):
138
+ """Support batch or single str.
139
+ Args:
140
+ text (str or list of str): texts to convert.
141
+ Returns:
142
+ torch.IntTensor [length_0 + length_1 + ... length_{n - 1}]: encoded texts.
143
+ torch.IntTensor [n]: length of each text.
144
+ """
145
+ length = []
146
+ result = []
147
+ results = []
148
+ for item in text:
149
+ if isinstance(item, bytes): item = item.decode('utf-8', 'strict')
150
+ length.append(len(item))
151
+ for char in item:
152
+ index = self.dict[char]
153
+ result.append(index)
154
+ results.append(result)
155
+ result = []
156
+
157
+ return torch.nn.utils.rnn.pad_sequence([torch.LongTensor(text) for text in results], batch_first=True), torch.IntTensor(length), None
158
+
159
+ def decode(self, t, length, raw=False):
160
+ """Decode encoded texts back into strs.
161
+ Args:
162
+ torch.IntTensor [length_0 + length_1 + ... length_{n - 1}]: encoded texts.
163
+ torch.IntTensor [n]: length of each text.
164
+ Raises:
165
+ AssertionError: when the text and its declared length do not match.
166
+ Returns:
167
+ text (str or list of str): decoded texts.
168
+ """
169
+ if length.numel() == 1:
170
+ length = length[0]
171
+ assert t.numel() == length, "text with length: {} does not match declared length: {}".format(t.numel(),
172
+ length)
173
+ if raw:
174
+ return ''.join([self.alphabet[i - 1] for i in t])
175
+ else:
176
+ char_list = []
177
+ for i in range(length):
178
+ if t[i] != 0 and (not (i > 0 and t[i - 1] == t[i])):
179
+ char_list.append(self.alphabet[t[i] - 1])
180
+ return ''.join(char_list)
181
+ else:
182
+ # batch mode
183
+ assert t.numel() == length.sum(), "texts with length: {} does not match declared length: {}".format(
184
+ t.numel(), length.sum())
185
+ texts = []
186
+ index = 0
187
+ for i in range(length.numel()):
188
+ l = length[i]
189
+ texts.append(
190
+ self.decode(
191
+ t[index:index + l], torch.IntTensor([l]), raw=raw))
192
+ index += l
193
+ return texts
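
A hedged round-trip example for `strLabelConverter` above (not part of the commit; assumes the repository is importable). `encode` pads a batch of words into CTC label tensors, while `decode` with `raw=False` collapses repeated indices and the blank index 0, as in greedy CTC decoding.

```python
import torch
from models.OCR_network import strLabelConverter

converter = strLabelConverter('abcdefghijklmnopqrstuvwxyz')
labels, lengths, _ = converter.encode(['hello', 'hi'])
print(labels.shape)  # torch.Size([2, 5]) -- padded to the longest word
print(lengths)       # tensor([5, 2], dtype=torch.int32)

t = labels[0][:lengths[0]]
print(converter.decode(t, torch.IntTensor([int(lengths[0])]), raw=True))   # 'hello'
print(converter.decode(t, torch.IntTensor([int(lengths[0])]), raw=False))  # 'helo' (repeats collapsed)
```
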
models/__init__.py ADDED
@@ -0,0 +1,65 @@
1
+ """This package contains modules related to objective functions, optimizations, and network architectures.
2
+
3
+ To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel.
4
+ You need to implement the following five functions:
5
+ -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt).
6
+ -- <set_input>: unpack data from dataset and apply preprocessing.
7
+ -- <forward>: produce intermediate results.
8
+ -- <optimize_parameters>: calculate loss, gradients, and update network weights.
9
+ -- <modify_commandline_options>: (optionally) add model-specific options and set default options.
10
+
11
+ In the function <__init__>, you need to define four lists:
12
+ -- self.loss_names (str list): specify the training losses that you want to plot and save.
13
+ -- self.model_names (str list): define networks used in our training.
14
+ -- self.visual_names (str list): specify the images that you want to display and save.
15
+ -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example of usage.
16
+
17
+ Now you can use the model class by specifying flag '--model dummy'.
18
+ """
19
+
20
+ import importlib
21
+
22
+
23
+ def find_model_using_name(model_name):
24
+ """Import the module "models/[model_name]_model.py".
25
+
26
+ In the file, the class called DatasetNameModel() will
27
+ be instantiated. It has to be a subclass of BaseModel,
28
+ and it is case-insensitive.
29
+ """
30
+ model_filename = "models." + model_name + "_model"
31
+ modellib = importlib.import_module(model_filename)
32
+ model = None
33
+ target_model_name = model_name.replace('_', '') + 'model'
34
+ for name, cls in modellib.__dict__.items():
35
+ if name.lower() == target_model_name.lower() \
36
+ and issubclass(cls, BaseModel):
37
+ model = cls
38
+
39
+ if model is None:
40
+ print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name))
41
+ exit(0)
42
+
43
+ return model
44
+
45
+
46
+ def get_option_setter(model_name):
47
+ """Return the static method <modify_commandline_options> of the model class."""
48
+ model_class = find_model_using_name(model_name)
49
+ return model_class.modify_commandline_options
50
+
51
+
52
+ def create_model(opt):
53
+ """Create a model given the option.
54
+
55
+ This function wraps the model class.
56
+ This is the main interface between this package and 'train.py'/'test.py'
57
+
58
+ Example:
59
+ >>> from models import create_model
60
+ >>> model = create_model(opt)
61
+ """
62
+ model = find_model_using_name(opt.model)
63
+ instance = model(opt)
64
+ print("model [%s] was created" % type(instance).__name__)
65
+ return instance
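
A small illustration (not part of the commit, and deliberately not calling into the repository) of the naming convention documented above: the option `--model dummy` is resolved to the module `models/dummy_model.py` and the class `DummyModel`.

```python
model_name = 'dummy'
model_filename = "models." + model_name + "_model"          # -> 'models.dummy_model'
target_model_name = model_name.replace('_', '') + 'model'   # -> 'dummymodel' (matched case-insensitively)
print(model_filename, target_model_name)
```
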
models/blocks.py ADDED
@@ -0,0 +1,190 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+
5
+
6
+ class ResBlocks(nn.Module):
7
+ def __init__(self, num_blocks, dim, norm, activation, pad_type):
8
+ super(ResBlocks, self).__init__()
9
+ self.model = []
10
+ for i in range(num_blocks):
11
+ self.model += [ResBlock(dim,
12
+ norm=norm,
13
+ activation=activation,
14
+ pad_type=pad_type)]
15
+ self.model = nn.Sequential(*self.model)
16
+
17
+ def forward(self, x):
18
+ return self.model(x)
19
+
20
+
21
+ class ResBlock(nn.Module):
22
+ def __init__(self, dim, norm='in', activation='relu', pad_type='zero'):
23
+ super(ResBlock, self).__init__()
24
+ model = []
25
+ model += [Conv2dBlock(dim, dim, 3, 1, 1,
26
+ norm=norm,
27
+ activation=activation,
28
+ pad_type=pad_type)]
29
+ model += [Conv2dBlock(dim, dim, 3, 1, 1,
30
+ norm=norm,
31
+ activation='none',
32
+ pad_type=pad_type)]
33
+ self.model = nn.Sequential(*model)
34
+
35
+ def forward(self, x):
36
+ residual = x
37
+ out = self.model(x)
38
+ out += residual
39
+ return out
40
+
41
+
42
+ class ActFirstResBlock(nn.Module):
43
+ def __init__(self, fin, fout, fhid=None,
44
+ activation='lrelu', norm='none'):
45
+ super().__init__()
46
+ self.learned_shortcut = (fin != fout)
47
+ self.fin = fin
48
+ self.fout = fout
49
+ self.fhid = min(fin, fout) if fhid is None else fhid
50
+ self.conv_0 = Conv2dBlock(self.fin, self.fhid, 3, 1,
51
+ padding=1, pad_type='reflect', norm=norm,
52
+ activation=activation, activation_first=True)
53
+ self.conv_1 = Conv2dBlock(self.fhid, self.fout, 3, 1,
54
+ padding=1, pad_type='reflect', norm=norm,
55
+ activation=activation, activation_first=True)
56
+ if self.learned_shortcut:
57
+ self.conv_s = Conv2dBlock(self.fin, self.fout, 1, 1,
58
+ activation='none', use_bias=False)
59
+
60
+ def forward(self, x):
61
+ x_s = self.conv_s(x) if self.learned_shortcut else x
62
+ dx = self.conv_0(x)
63
+ dx = self.conv_1(dx)
64
+ out = x_s + dx
65
+ return out
66
+
67
+
68
+ class LinearBlock(nn.Module):
69
+ def __init__(self, in_dim, out_dim, norm='none', activation='relu'):
70
+ super(LinearBlock, self).__init__()
71
+ use_bias = True
72
+ self.fc = nn.Linear(in_dim, out_dim, bias=use_bias)
73
+
74
+ # initialize normalization
75
+ norm_dim = out_dim
76
+ if norm == 'bn':
77
+ self.norm = nn.BatchNorm1d(norm_dim)
78
+ elif norm == 'in':
79
+ self.norm = nn.InstanceNorm1d(norm_dim)
80
+ elif norm == 'none':
81
+ self.norm = None
82
+ else:
83
+ assert 0, "Unsupported normalization: {}".format(norm)
84
+
85
+ # initialize activation
86
+ if activation == 'relu':
87
+ self.activation = nn.ReLU(inplace=False)
88
+ elif activation == 'lrelu':
89
+ self.activation = nn.LeakyReLU(0.2, inplace=False)
90
+ elif activation == 'tanh':
91
+ self.activation = nn.Tanh()
92
+ elif activation == 'none':
93
+ self.activation = None
94
+ else:
95
+ assert 0, "Unsupported activation: {}".format(activation)
96
+
97
+ def forward(self, x):
98
+ out = self.fc(x)
99
+ if self.norm:
100
+ out = self.norm(out)
101
+ if self.activation:
102
+ out = self.activation(out)
103
+ return out
104
+
105
+
106
+ class Conv2dBlock(nn.Module):
107
+ def __init__(self, in_dim, out_dim, ks, st, padding=0,
108
+ norm='none', activation='relu', pad_type='zero',
109
+ use_bias=True, activation_first=False):
110
+ super(Conv2dBlock, self).__init__()
111
+ self.use_bias = use_bias
112
+ self.activation_first = activation_first
113
+ # initialize padding
114
+ if pad_type == 'reflect':
115
+ self.pad = nn.ReflectionPad2d(padding)
116
+ elif pad_type == 'replicate':
117
+ self.pad = nn.ReplicationPad2d(padding)
118
+ elif pad_type == 'zero':
119
+ self.pad = nn.ZeroPad2d(padding)
120
+ else:
121
+ assert 0, "Unsupported padding type: {}".format(pad_type)
122
+
123
+ # initialize normalization
124
+ norm_dim = out_dim
125
+ if norm == 'bn':
126
+ self.norm = nn.BatchNorm2d(norm_dim)
127
+ elif norm == 'in':
128
+ self.norm = nn.InstanceNorm2d(norm_dim)
129
+ elif norm == 'adain':
130
+ self.norm = AdaptiveInstanceNorm2d(norm_dim)
131
+ elif norm == 'none':
132
+ self.norm = None
133
+ else:
134
+ assert 0, "Unsupported normalization: {}".format(norm)
135
+
136
+ # initialize activation
137
+ if activation == 'relu':
138
+ self.activation = nn.ReLU(inplace=False)
139
+ elif activation == 'lrelu':
140
+ self.activation = nn.LeakyReLU(0.2, inplace=False)
141
+ elif activation == 'tanh':
142
+ self.activation = nn.Tanh()
143
+ elif activation == 'none':
144
+ self.activation = None
145
+ else:
146
+ assert 0, "Unsupported activation: {}".format(activation)
147
+
148
+ self.conv = nn.Conv2d(in_dim, out_dim, ks, st, bias=self.use_bias)
149
+
150
+ def forward(self, x):
151
+ if self.activation_first:
152
+ if self.activation:
153
+ x = self.activation(x)
154
+ x = self.conv(self.pad(x))
155
+ if self.norm:
156
+ x = self.norm(x)
157
+ else:
158
+ x = self.conv(self.pad(x))
159
+ if self.norm:
160
+ x = self.norm(x)
161
+ if self.activation:
162
+ x = self.activation(x)
163
+ return x
164
+
165
+
166
+ class AdaptiveInstanceNorm2d(nn.Module):
167
+ def __init__(self, num_features, eps=1e-5, momentum=0.1):
168
+ super(AdaptiveInstanceNorm2d, self).__init__()
169
+ self.num_features = num_features
170
+ self.eps = eps
171
+ self.momentum = momentum
172
+ self.weight = None
173
+ self.bias = None
174
+ self.register_buffer('running_mean', torch.zeros(num_features))
175
+ self.register_buffer('running_var', torch.ones(num_features))
176
+
177
+ def forward(self, x):
178
+ assert self.weight is not None and \
179
+ self.bias is not None, "Please assign AdaIN weight first"
180
+ b, c = x.size(0), x.size(1)
181
+ running_mean = self.running_mean.repeat(b)
182
+ running_var = self.running_var.repeat(b)
183
+ x_reshaped = x.contiguous().view(1, b * c, *x.size()[2:])
184
+ out = F.batch_norm(
185
+ x_reshaped, running_mean, running_var, self.weight, self.bias,
186
+ True, self.momentum, self.eps)
187
+ return out.view(b, c, *x.size()[2:])
188
+
189
+ def __repr__(self):
190
+ return self.__class__.__name__ + '(' + str(self.num_features) + ')'
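
A hedged shape check for the building blocks above (not part of the commit; assumes the repository is importable). `ResBlocks` and a stride-1 `Conv2dBlock` with matching padding preserve the spatial size, which is what the generator's decoder relies on.

```python
import torch
from models.blocks import Conv2dBlock, ResBlocks

x = torch.randn(2, 512, 4, 32)   # (B, C, H, W) feature map
res = ResBlocks(num_blocks=2, dim=512, norm='in', activation='relu', pad_type='reflect')
conv = Conv2dBlock(512, 256, 5, 1, 2, norm='in', activation='relu', pad_type='reflect')
print(res(x).shape)   # torch.Size([2, 512, 4, 32])
print(conv(x).shape)  # torch.Size([2, 256, 4, 32])
```
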
models/config.py ADDED
@@ -0,0 +1,6 @@
1
+ tn_hidden_dim = 512
2
+ tn_dropout = 0.1
3
+ tn_nheads = 8
4
+ tn_dim_feedforward = 512
5
+ tn_enc_layers = 3
6
+ tn_dec_layers = 3
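
These constants size the style encoder and content decoder built in `models/model.py`. As a hedged illustration only (the repository defines its own transformer modules in `models/transformer.py`), the same hyperparameters map onto `torch.nn` equivalents like this:

```python
import torch.nn as nn
import models.config as config

layer = nn.TransformerEncoderLayer(
    d_model=config.tn_hidden_dim,               # 512-dimensional tokens
    nhead=config.tn_nheads,                     # 8 attention heads
    dim_feedforward=config.tn_dim_feedforward,  # 512-dimensional feed-forward block
    dropout=config.tn_dropout)                  # 0.1
encoder = nn.TransformerEncoder(layer, num_layers=config.tn_enc_layers)  # 3 layers
```
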
models/inception.py ADDED
@@ -0,0 +1,311 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torchvision import models
5
+
6
+ try:
7
+ from torchvision.models.utils import load_state_dict_from_url
8
+ except ImportError:
9
+ from torch.utils.model_zoo import load_url as load_state_dict_from_url
10
+
11
+ # Inception weights ported to Pytorch from
12
+ # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
13
+ FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth'
14
+
15
+
16
+ class InceptionV3(nn.Module):
17
+ """Pretrained InceptionV3 network returning feature maps"""
18
+
19
+ # Index of default block of inception to return,
20
+ # corresponds to output of final average pooling
21
+ DEFAULT_BLOCK_INDEX = 3
22
+
23
+ # Maps feature dimensionality to their output blocks indices
24
+ BLOCK_INDEX_BY_DIM = {
25
+ 64: 0, # First max pooling features
26
+ 192: 1, # Second max pooling featurs
27
+ 768: 2, # Pre-aux classifier features
28
+ 2048: 3 # Final average pooling features
29
+ }
30
+
31
+ def __init__(self,
32
+ output_blocks=[DEFAULT_BLOCK_INDEX],
33
+ resize_input=True,
34
+ normalize_input=True,
35
+ requires_grad=False,
36
+ use_fid_inception=True):
37
+ """Build pretrained InceptionV3
38
+
39
+ Parameters
40
+ ----------
41
+ output_blocks : list of int
42
+ Indices of blocks to return features of. Possible values are:
43
+ - 0: corresponds to output of first max pooling
44
+ - 1: corresponds to output of second max pooling
45
+ - 2: corresponds to output which is fed to aux classifier
46
+ - 3: corresponds to output of final average pooling
47
+ resize_input : bool
48
+ If true, bilinearly resizes input to width and height 299 before
49
+ feeding input to model. As the network without fully connected
50
+ layers is fully convolutional, it should be able to handle inputs
51
+ of arbitrary size, so resizing might not be strictly needed
52
+ normalize_input : bool
53
+ If true, scales the input from range (0, 1) to the range the
54
+ pretrained Inception network expects, namely (-1, 1)
55
+ requires_grad : bool
56
+ If true, parameters of the model require gradients. Possibly useful
57
+ for finetuning the network
58
+ use_fid_inception : bool
59
+ If true, uses the pretrained Inception model used in Tensorflow's
60
+ FID implementation. If false, uses the pretrained Inception model
61
+ available in torchvision. The FID Inception model has different
62
+ weights and a slightly different structure from torchvision's
63
+ Inception model. If you want to compute FID scores, you are
64
+ strongly advised to set this parameter to true to get comparable
65
+ results.
66
+ """
67
+ super(InceptionV3, self).__init__()
68
+
69
+ self.resize_input = resize_input
70
+ self.normalize_input = normalize_input
71
+ self.output_blocks = sorted(output_blocks)
72
+ self.last_needed_block = max(output_blocks)
73
+
74
+ assert self.last_needed_block <= 3, \
75
+ 'Last possible output block index is 3'
76
+
77
+ self.blocks = nn.ModuleList()
78
+
79
+ if use_fid_inception:
80
+ inception = fid_inception_v3()
81
+ else:
82
+ inception = models.inception_v3(pretrained=True)
83
+
84
+ # Block 0: input to maxpool1
85
+ block0 = [
86
+ inception.Conv2d_1a_3x3,
87
+ inception.Conv2d_2a_3x3,
88
+ inception.Conv2d_2b_3x3,
89
+ nn.MaxPool2d(kernel_size=3, stride=2)
90
+ ]
91
+ self.blocks.append(nn.Sequential(*block0))
92
+
93
+ # Block 1: maxpool1 to maxpool2
94
+ if self.last_needed_block >= 1:
95
+ block1 = [
96
+ inception.Conv2d_3b_1x1,
97
+ inception.Conv2d_4a_3x3,
98
+ nn.MaxPool2d(kernel_size=3, stride=2)
99
+ ]
100
+ self.blocks.append(nn.Sequential(*block1))
101
+
102
+ # Block 2: maxpool2 to aux classifier
103
+ if self.last_needed_block >= 2:
104
+ block2 = [
105
+ inception.Mixed_5b,
106
+ inception.Mixed_5c,
107
+ inception.Mixed_5d,
108
+ inception.Mixed_6a,
109
+ inception.Mixed_6b,
110
+ inception.Mixed_6c,
111
+ inception.Mixed_6d,
112
+ inception.Mixed_6e,
113
+ ]
114
+ self.blocks.append(nn.Sequential(*block2))
115
+
116
+ # Block 3: aux classifier to final avgpool
117
+ if self.last_needed_block >= 3:
118
+ block3 = [
119
+ inception.Mixed_7a,
120
+ inception.Mixed_7b,
121
+ inception.Mixed_7c,
122
+ nn.AdaptiveAvgPool2d(output_size=(1, 1))
123
+ ]
124
+ self.blocks.append(nn.Sequential(*block3))
125
+
126
+ for param in self.parameters():
127
+ param.requires_grad = requires_grad
128
+
129
+ def forward(self, inp):
130
+ """Get Inception feature maps
131
+
132
+ Parameters
133
+ ----------
134
+ inp : torch.autograd.Variable
135
+ Input tensor of shape Bx3xHxW. Values are expected to be in
136
+ range (0, 1)
137
+
138
+ Returns
139
+ -------
140
+ List of torch.autograd.Variable, corresponding to the selected output
141
+ block, sorted ascending by index
142
+ """
143
+ outp = []
144
+ x = inp
145
+
146
+ if self.resize_input:
147
+ x = F.interpolate(x,
148
+ size=(299, 299),
149
+ mode='bilinear',
150
+ align_corners=False)
151
+
152
+ if self.normalize_input:
153
+ x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1)
154
+
155
+ for idx, block in enumerate(self.blocks):
156
+ x = block(x)
157
+ if idx in self.output_blocks:
158
+ outp.append(x)
159
+
160
+ if idx == self.last_needed_block:
161
+ break
162
+
163
+ return outp
164
+
165
+
166
+ def fid_inception_v3():
167
+ """Build pretrained Inception model for FID computation
168
+
169
+ The Inception model for FID computation uses a different set of weights
170
+ and has a slightly different structure than torchvision's Inception.
171
+
172
+ This method first constructs torchvision's Inception and then patches the
173
+ necessary parts that are different in the FID Inception model.
174
+ """
175
+ inception = models.inception_v3(num_classes=1008,
176
+ aux_logits=False,
177
+ weights=None,
178
+ init_weights=False)
179
+ inception.Mixed_5b = FIDInceptionA(192, pool_features=32)
180
+ inception.Mixed_5c = FIDInceptionA(256, pool_features=64)
181
+ inception.Mixed_5d = FIDInceptionA(288, pool_features=64)
182
+ inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128)
183
+ inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160)
184
+ inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160)
185
+ inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192)
186
+ inception.Mixed_7b = FIDInceptionE_1(1280)
187
+ inception.Mixed_7c = FIDInceptionE_2(2048)
188
+
189
+ state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True)
190
+ inception.load_state_dict(state_dict)
191
+ return inception
192
+
193
+
194
+ class FIDInceptionA(models.inception.InceptionA):
195
+ """InceptionA block patched for FID computation"""
196
+ def __init__(self, in_channels, pool_features):
197
+ super(FIDInceptionA, self).__init__(in_channels, pool_features)
198
+
199
+ def forward(self, x):
200
+ branch1x1 = self.branch1x1(x)
201
+
202
+ branch5x5 = self.branch5x5_1(x)
203
+ branch5x5 = self.branch5x5_2(branch5x5)
204
+
205
+ branch3x3dbl = self.branch3x3dbl_1(x)
206
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
207
+ branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
208
+
209
+ # Patch: Tensorflow's average pool does not use the padded zeros in
210
+ # its average calculation
211
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
212
+ count_include_pad=False)
213
+ branch_pool = self.branch_pool(branch_pool)
214
+
215
+ outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
216
+ return torch.cat(outputs, 1)
217
+
218
+
219
+ class FIDInceptionC(models.inception.InceptionC):
220
+ """InceptionC block patched for FID computation"""
221
+ def __init__(self, in_channels, channels_7x7):
222
+ super(FIDInceptionC, self).__init__(in_channels, channels_7x7)
223
+
224
+ def forward(self, x):
225
+ branch1x1 = self.branch1x1(x)
226
+
227
+ branch7x7 = self.branch7x7_1(x)
228
+ branch7x7 = self.branch7x7_2(branch7x7)
229
+ branch7x7 = self.branch7x7_3(branch7x7)
230
+
231
+ branch7x7dbl = self.branch7x7dbl_1(x)
232
+ branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
233
+ branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
234
+ branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
235
+ branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
236
+
237
+ # Patch: Tensorflow's average pool does not use the padded zeros in
238
+ # its average calculation
239
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
240
+ count_include_pad=False)
241
+ branch_pool = self.branch_pool(branch_pool)
242
+
243
+ outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
244
+ return torch.cat(outputs, 1)
245
+
246
+
247
+ class FIDInceptionE_1(models.inception.InceptionE):
248
+ """First InceptionE block patched for FID computation"""
249
+ def __init__(self, in_channels):
250
+ super(FIDInceptionE_1, self).__init__(in_channels)
251
+
252
+ def forward(self, x):
253
+ branch1x1 = self.branch1x1(x)
254
+
255
+ branch3x3 = self.branch3x3_1(x)
256
+ branch3x3 = [
257
+ self.branch3x3_2a(branch3x3),
258
+ self.branch3x3_2b(branch3x3),
259
+ ]
260
+ branch3x3 = torch.cat(branch3x3, 1)
261
+
262
+ branch3x3dbl = self.branch3x3dbl_1(x)
263
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
264
+ branch3x3dbl = [
265
+ self.branch3x3dbl_3a(branch3x3dbl),
266
+ self.branch3x3dbl_3b(branch3x3dbl),
267
+ ]
268
+ branch3x3dbl = torch.cat(branch3x3dbl, 1)
269
+
270
+ # Patch: Tensorflow's average pool does not use the padded zeros in
271
+ # its average calculation
272
+ branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
273
+ count_include_pad=False)
274
+ branch_pool = self.branch_pool(branch_pool)
275
+
276
+ outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
277
+ return torch.cat(outputs, 1)
278
+
279
+
280
+ class FIDInceptionE_2(models.inception.InceptionE):
281
+ """Second InceptionE block patched for FID computation"""
282
+ def __init__(self, in_channels):
283
+ super(FIDInceptionE_2, self).__init__(in_channels)
284
+
285
+ def forward(self, x):
286
+ branch1x1 = self.branch1x1(x)
287
+
288
+ branch3x3 = self.branch3x3_1(x)
289
+ branch3x3 = [
290
+ self.branch3x3_2a(branch3x3),
291
+ self.branch3x3_2b(branch3x3),
292
+ ]
293
+ branch3x3 = torch.cat(branch3x3, 1)
294
+
295
+ branch3x3dbl = self.branch3x3dbl_1(x)
296
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
297
+ branch3x3dbl = [
298
+ self.branch3x3dbl_3a(branch3x3dbl),
299
+ self.branch3x3dbl_3b(branch3x3dbl),
300
+ ]
301
+ branch3x3dbl = torch.cat(branch3x3dbl, 1)
302
+
303
+ # Patch: The FID Inception model uses max pooling instead of average
304
+ # pooling. This is likely an error in this specific Inception
305
+ # implementation, as other Inception models use average pooling here
306
+ # (which matches the description in the paper).
307
+ branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
308
+ branch_pool = self.branch_pool(branch_pool)
309
+
310
+ outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
311
+ return torch.cat(outputs, 1)
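
A hedged usage sketch for the FID feature extractor above (not part of the commit; assumes the repository is importable and that the FID weights can be downloaded from `FID_WEIGHTS_URL` on first use).

```python
import torch
from models.inception import InceptionV3

block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[2048]
net = InceptionV3([block_idx]).eval()

imgs = torch.rand(4, 3, 64, 64)   # values in (0, 1); resized to 299x299 internally
with torch.no_grad():
    feats = net(imgs)[0]          # one tensor per requested output block
print(feats.shape)                # torch.Size([4, 2048, 1, 1])
```
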
models/model.py ADDED
@@ -0,0 +1,894 @@
1
+ import torch.utils.data
2
+ from torch.nn import CTCLoss
3
+ from torch.nn.utils import clip_grad_norm_
4
+ import sys
5
+ import torchvision.models as models
6
+
7
+ from models.inception import InceptionV3
8
+ from models.transformer import *
9
+ from util.augmentations import OCRAugment
10
+ from util.misc import SmoothedValue
11
+ from util.text import get_generator, AugmentedGenerator
12
+ from .BigGAN_networks import *
13
+ from .OCR_network import *
14
+ from models.blocks import Conv2dBlock, ResBlocks
15
+ from util.util import loss_hinge_dis, loss_hinge_gen, make_one_hot
16
+
17
+ import models.config as config
18
+ from .positional_encodings import PositionalEncoding1D
19
+ from models.unifont_module import UnifontModule
20
+ from PIL import Image
21
+
22
+
23
+ def get_rgb(x):
24
+ R = 255 - int(int(x > 0.5) * 255 * (x - 0.5) / 0.5)
25
+ G = 0
26
+ B = 255 + int(int(x < 0.5) * 255 * (x - 0.5) / 0.5)
27
+ return R, G, B
28
+
29
+
30
+ def get_page_from_words(word_lists, MAX_IMG_WIDTH=800):
31
+ line_all = []
32
+ line_t = []
33
+
34
+ width_t = 0
35
+
36
+ for i in word_lists:
37
+
38
+ width_t = width_t + i.shape[1] + 16
39
+
40
+ if width_t > MAX_IMG_WIDTH:
41
+ line_all.append(np.concatenate(line_t, 1))
42
+
43
+ line_t = []
44
+
45
+ width_t = i.shape[1] + 16
46
+
47
+ line_t.append(i)
48
+ line_t.append(np.ones((i.shape[0], 16)))
49
+
50
+ if len(line_all) == 0:
51
+ line_all.append(np.concatenate(line_t, 1))
52
+
53
+ max_lin_widths = MAX_IMG_WIDTH # max([i.shape[1] for i in line_all])
54
+ gap_h = np.ones([16, max_lin_widths])
55
+
56
+ page_ = []
57
+
58
+ for l in line_all:
59
+ pad_ = np.ones([l.shape[0], max_lin_widths - l.shape[1]])
60
+
61
+ page_.append(np.concatenate([l, pad_], 1))
62
+ page_.append(gap_h)
63
+
64
+ page = np.concatenate(page_, 0)
65
+
66
+ return page * 255
67
+
68
+
69
+ class FCNDecoder(nn.Module):
70
+ def __init__(self, ups=3, n_res=2, dim=512, out_dim=1, res_norm='adain', activ='relu', pad_type='reflect'):
71
+ super(FCNDecoder, self).__init__()
72
+
73
+ self.model = []
74
+ self.model += [ResBlocks(n_res, dim, res_norm,
75
+ activ, pad_type=pad_type)]
76
+ for i in range(ups):
77
+ self.model += [nn.Upsample(scale_factor=2),
78
+ Conv2dBlock(dim, dim // 2, 5, 1, 2,
79
+ norm='in',
80
+ activation=activ,
81
+ pad_type=pad_type)]
82
+ dim //= 2
83
+ self.model += [Conv2dBlock(dim, out_dim, 7, 1, 3,
84
+ norm='none',
85
+ activation='tanh',
86
+ pad_type=pad_type)]
87
+ self.model = nn.Sequential(*self.model)
88
+
89
+ def forward(self, x):
90
+ y = self.model(x)
91
+
92
+ return y
93
+
94
+
95
+ class Generator(nn.Module):
96
+
97
+ def __init__(self, args):
98
+ super(Generator, self).__init__()
99
+ self.args = args
100
+ INP_CHANNEL = 1
101
+
102
+ encoder_layer = TransformerEncoderLayer(config.tn_hidden_dim, config.tn_nheads,
103
+ config.tn_dim_feedforward,
104
+ config.tn_dropout, "relu", True)
105
+ encoder_norm = nn.LayerNorm(config.tn_hidden_dim) if True else None
106
+ self.encoder = TransformerEncoder(encoder_layer, config.tn_enc_layers, encoder_norm)
107
+
108
+ decoder_layer = TransformerDecoderLayer(config.tn_hidden_dim, config.tn_nheads,
109
+ config.tn_dim_feedforward,
110
+ config.tn_dropout, "relu", True)
111
+ decoder_norm = nn.LayerNorm(config.tn_hidden_dim)
112
+ self.decoder = TransformerDecoder(decoder_layer, config.tn_dec_layers, decoder_norm,
113
+ return_intermediate=True)
114
+
115
+ self.Feat_Encoder = models.resnet18(weights='ResNet18_Weights.DEFAULT')
116
+ self.Feat_Encoder.conv1 = nn.Conv2d(INP_CHANNEL, 64, kernel_size=7, stride=2, padding=3, bias=False)
117
+ self.Feat_Encoder.fc = nn.Identity()
118
+ self.Feat_Encoder.avgpool = nn.Identity()
119
+
120
+ # self.query_embed = nn.Embedding(self.args.vocab_size, self.args.tn_hidden_dim)
121
+ self.query_embed = UnifontModule(
122
+ config.tn_dim_feedforward,
123
+ self.args.alphabet + self.args.special_alphabet,
124
+ input_type=self.args.query_input,
125
+ device=self.args.device
126
+ )
127
+
128
+ self.pos_encoder = PositionalEncoding1D(config.tn_hidden_dim)
129
+
130
+ self.linear_q = nn.Linear(config.tn_dim_feedforward, config.tn_dim_feedforward * 8)
131
+
132
+ self.DEC = FCNDecoder(res_norm='in', dim=config.tn_hidden_dim)
133
+
134
+ self.noise = torch.distributions.Normal(loc=torch.tensor([0.]), scale=torch.tensor([1.0]))
135
+
136
+ def evaluate(self, style_images, queries):
137
+ style = self.compute_style(style_images)
138
+
139
+ results = []
140
+
141
+ for i in range(queries.shape[1]):
142
+ query = queries[:, i, :]
143
+ h = self.generate(style, query)
144
+
145
+ results.append(h.detach())
146
+
147
+ return results
148
+
149
+ def compute_style(self, style_images):
150
+ B, N, R, C = style_images.shape
151
+ FEAT_ST = self.Feat_Encoder(style_images.view(B * N, 1, R, C))
152
+ FEAT_ST = FEAT_ST.view(B, 512, 1, -1)
153
+ FEAT_ST_ENC = FEAT_ST.flatten(2).permute(2, 0, 1)
154
+ memory = self.encoder(FEAT_ST_ENC)
155
+ return memory
156
+
157
+ def generate(self, style_vector, query):
158
+ query_embed = self.query_embed(query).permute(1, 0, 2)
159
+
160
+ tgt = torch.zeros_like(query_embed)
161
+ hs = self.decoder(tgt, style_vector, query_pos=query_embed)
162
+
163
+ h = hs.transpose(1, 2)[-1]
164
+
165
+ if self.args.add_noise:
166
+ h = h + self.noise.sample(h.size()).squeeze(-1).to(self.args.device)
167
+
168
+ h = self.linear_q(h)
169
+ h = h.contiguous()
170
+
171
+ h = h.view(h.size(0), h.shape[1] * 2, 4, -1)
172
+ h = h.permute(0, 3, 2, 1)
173
+
174
+ h = self.DEC(h)
175
+
176
+ return h
177
+
178
+ def forward(self, style_images, query):
179
+ enc_attn_weights, dec_attn_weights = [], []
180
+
181
+ self.hooks = [
182
+
183
+ self.encoder.layers[-1].self_attn.register_forward_hook(
184
+ lambda self, input, output: enc_attn_weights.append(output[1])
185
+ ),
186
+ self.decoder.layers[-1].multihead_attn.register_forward_hook(
187
+ lambda self, input, output: dec_attn_weights.append(output[1])
188
+ ),
189
+ ]
190
+
191
+ style = self.compute_style(style_images)
192
+
193
+ h = self.generate(style, query)
194
+
195
+ self.dec_attn_weights = dec_attn_weights[-1].detach()
196
+ self.enc_attn_weights = enc_attn_weights[-1].detach()
197
+
198
+ for hook in self.hooks:
199
+ hook.remove()
200
+
201
+ return h, style
202
+
203
+
204
+ class VATr(nn.Module):
205
+
206
+ def __init__(self, args):
207
+ super(VATr, self).__init__()
208
+ self.args = args
209
+ self.args.vocab_size = len(args.alphabet)
210
+
211
+ self.epsilon = 1e-7
212
+ self.netG = Generator(self.args).to(self.args.device)
213
+ self.netD = Discriminator(
214
+ resolution=self.args.resolution, crop_size=args.d_crop_size,
215
+ ).to(self.args.device)
216
+
217
+ self.netW = WDiscriminator(resolution=self.args.resolution, n_classes=self.args.vocab_size, output_dim=self.args.num_writers)
218
+ self.netW = self.netW.to(self.args.device)
219
+ self.netconverter = strLabelConverter(self.args.alphabet + self.args.special_alphabet)
220
+
221
+ self.netOCR = CRNN(self.args).to(self.args.device)
222
+
223
+ self.ocr_augmenter = OCRAugment(prob=0.5, no=3)
224
+ self.OCR_criterion = CTCLoss(zero_infinity=True, reduction='none')
225
+
226
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[2048]
227
+ self.inception = InceptionV3([block_idx]).to(self.args.device)
228
+
229
+ self.optimizer_G = torch.optim.Adam(self.netG.parameters(),
230
+ lr=self.args.g_lr, betas=(0.0, 0.999), weight_decay=0, eps=1e-8)
231
+
232
+ self.optimizer_OCR = torch.optim.Adam(self.netOCR.parameters(),
233
+ lr=self.args.ocr_lr, betas=(0.0, 0.999), weight_decay=0, eps=1e-8)
234
+
235
+ self.optimizer_D = torch.optim.Adam(self.netD.parameters(),
236
+ lr=self.args.d_lr, betas=(0.0, 0.999), weight_decay=0, eps=1e-8)
237
+
238
+ self.optimizer_wl = torch.optim.Adam(self.netW.parameters(),
239
+ lr=self.args.w_lr, betas=(0.0, 0.999), weight_decay=0, eps=1e-8)
240
+
241
+ self.optimizers = [self.optimizer_G, self.optimizer_OCR, self.optimizer_D, self.optimizer_wl]
242
+
243
+ self.optimizer_G.zero_grad()
244
+ self.optimizer_OCR.zero_grad()
245
+ self.optimizer_D.zero_grad()
246
+ self.optimizer_wl.zero_grad()
247
+
248
+ self.loss_G = 0
249
+ self.loss_D = 0
250
+ self.loss_Dfake = 0
251
+ self.loss_Dreal = 0
252
+ self.loss_OCR_fake = 0
253
+ self.loss_OCR_real = 0
254
+ self.loss_w_fake = 0
255
+ self.loss_w_real = 0
256
+ self.Lcycle = 0
257
+ self.d_acc = SmoothedValue()
258
+
259
+ self.word_generator = get_generator(args)
260
+
261
+ self.epoch = 0
262
+
263
+ with open('mytext.txt', 'r', encoding='utf-8') as f:
264
+ self.text = f.read()
265
+ self.text = self.text.replace('\n', ' ')
267
+ self.text = ''.join(c for c in self.text if c in (self.args.alphabet + self.args.special_alphabet)) # just to avoid problems with the font dataset
268
+ self.text = [word.encode() for word in self.text.split()] # [:args.num_examples]
269
+
270
+ self.eval_text_encode, self.eval_len_text, self.eval_encode_pos = self.netconverter.encode(self.text)
271
+ self.eval_text_encode = self.eval_text_encode.to(self.args.device).repeat(self.args.batch_size, 1, 1)
272
+
273
+ self.rv_sample_size = 64 * 4
274
+ self.last_fakes = []
275
+
276
+ def update_last_fakes(self, fakes):
277
+ for fake in fakes:
278
+ self.last_fakes.append(fake)
279
+ self.last_fakes = self.last_fakes[-self.rv_sample_size:]
280
+
281
+ def update_acc(self, pred_real, pred_fake):
282
+ correct = (pred_real >= 0.5).float().sum() + (pred_fake < 0.5).float().sum()
283
+ self.d_acc.update(correct / (len(pred_real) + len(pred_fake)))
284
+
285
+ def set_text_aug_strength(self, strength):
286
+ if not isinstance(self.word_generator, AugmentedGenerator):
287
+ print("WARNING: Text generator is not augmented, strength cannot be set")
288
+ else:
289
+ self.word_generator.set_strength(strength)
290
+
291
+ def get_text_aug_strength(self):
292
+ if isinstance(self.word_generator, AugmentedGenerator):
293
+ return self.word_generator.strength
294
+ else:
295
+ return 0.0
296
+
297
+ def update_parameters(self, epoch: int):
298
+ self.epoch = epoch
299
+ self.netD.update_parameters(epoch)
300
+ self.netW.update_parameters(epoch)
301
+
302
+ def get_text_sample(self, size: int) -> list:
303
+ return [self.word_generator.generate() for _ in range(size)]
304
+
305
+ def _generate_fakes(self, ST, eval_text_encode=None, eval_len_text=None):
306
+ if eval_text_encode is None:
307
+ eval_text_encode = self.eval_text_encode
308
+ if eval_len_text is None:
309
+ eval_len_text = self.eval_len_text
310
+
311
+ self.fakes = self.netG.evaluate(ST, eval_text_encode)
312
+
313
+ np_fakes = []
314
+ for batch_idx in range(self.fakes[0].shape[0]):
315
+ for idx, fake in enumerate(self.fakes):
316
+ fake = fake[batch_idx, 0, :, :eval_len_text[idx] * self.args.resolution]
317
+ fake = (fake + 1) / 2
318
+ np_fakes.append(fake.cpu().numpy())
319
+ return np_fakes
320
+
321
+ def _generate_page(self, ST, SLEN, eval_text_encode=None, eval_len_text=None, eval_encode_pos=None, lwidth=260, rwidth=980):
322
+ # ST -> Style?
323
+
324
+ if eval_text_encode is None:
325
+ eval_text_encode = self.eval_text_encode
326
+ if eval_len_text is None:
327
+ eval_len_text = self.eval_len_text
328
+ if eval_encode_pos is None:
329
+ eval_encode_pos = self.eval_encode_pos
330
+
331
+ text_encode, text_len, _ = self.netconverter.encode(self.args.special_alphabet)
332
+ symbols = self.netG.query_embed.symbols[text_encode].reshape(-1, 16, 16).cpu().numpy()
333
+ imgs = [Image.fromarray(s).resize((32, 32), resample=0) for s in symbols]
334
+ special_examples = 1 - np.concatenate([np.array(i) for i in imgs], axis=-1)
335
+
336
+ self.fakes = self.netG.evaluate(ST, eval_text_encode)
337
+
338
+ page1s = []
339
+ page2s = []
340
+
341
+ for batch_idx in range(ST.shape[0]):
342
+
343
+ word_t = []
344
+ word_l = []
345
+
346
+ gap = np.ones([self.args.img_height, 16])
347
+
348
+ line_wids = []
349
+
350
+ for idx, fake_ in enumerate(self.fakes):
351
+
352
+ word_t.append((fake_[batch_idx, 0, :, :eval_len_text[idx] * self.args.resolution].cpu().numpy() + 1) / 2)
353
+
354
+ word_t.append(gap)
355
+
356
+ if sum(t.shape[-1] for t in word_t) >= rwidth or idx == len(self.fakes) - 1 or (len(self.fakes) - len(self.args.special_alphabet) - 1) == idx:
357
+ line_ = np.concatenate(word_t, -1)
358
+
359
+ word_l.append(line_)
360
+ line_wids.append(line_.shape[1])
361
+
362
+ word_t = []
363
+
364
+ # add the examples from the UnifontModules
365
+ word_l.append(special_examples)
366
+ line_wids.append(special_examples.shape[1])
367
+
368
+ gap_h = np.ones([16, max(line_wids)])
369
+
370
+ page_ = []
371
+
372
+ for l in word_l:
373
+ pad_ = np.ones([self.args.img_height, max(line_wids) - l.shape[1]])
374
+
375
+ page_.append(np.concatenate([l, pad_], 1))
376
+ page_.append(gap_h)
377
+
378
+ page1 = np.concatenate(page_, 0)
379
+
380
+ word_t = []
381
+ word_l = []
382
+
383
+
384
+ line_wids = []
385
+
386
+ sdata_ = [i.unsqueeze(1) for i in torch.unbind(ST, 1)]
387
+ gap = np.ones([sdata_[0].shape[-2], 16])
388
+
389
+ for idx, st in enumerate((sdata_)):
390
+
391
+ word_t.append((st[batch_idx, 0, :, :int(SLEN.cpu().numpy()[batch_idx][idx])].cpu().numpy() + 1) / 2)
392
+ # word_t.append((st[batch_idx, 0, :, :].cpu().numpy() + 1) / 2)
393
+
394
+ word_t.append(gap)
395
+
396
+ if sum(t.shape[-1] for t in word_t) >= lwidth or idx == len(sdata_) - 1:
397
+ line_ = np.concatenate(word_t, -1)
398
+
399
+ word_l.append(line_)
400
+ line_wids.append(line_.shape[1])
401
+
402
+ word_t = []
403
+
404
+ gap_h = np.ones([16, max(line_wids)])
405
+
406
+ page_ = []
407
+
408
+ for l in word_l:
409
+ pad_ = np.ones([sdata_[0].shape[-2], max(line_wids) - l.shape[1]])
410
+
411
+ page_.append(np.concatenate([l, pad_], 1))
412
+ page_.append(gap_h)
413
+
414
+ page2 = np.concatenate(page_, 0)
415
+
416
+ merge_w_size = max(page1.shape[0], page2.shape[0])
417
+
418
+ if page1.shape[0] != merge_w_size:
419
+ page1 = np.concatenate([page1, np.ones([merge_w_size - page1.shape[0], page1.shape[1]])], 0)
420
+
421
+ if page2.shape[0] != merge_w_size:
422
+ page2 = np.concatenate([page2, np.ones([merge_w_size - page2.shape[0], page2.shape[1]])], 0)
423
+
424
+ page1s.append(page1)
425
+ page2s.append(page2)
426
+
427
+ # page = np.concatenate([page2, page1], 1)
428
+
429
+ page1s_ = np.concatenate(page1s, 0)
430
+ max_wid = max([i.shape[1] for i in page2s])
431
+ padded_page2s = []
432
+
433
+ for para in page2s:
434
+ padded_page2s.append(np.concatenate([para, np.ones([para.shape[0], max_wid - para.shape[1]])], 1))
435
+
436
+ padded_page2s_ = np.concatenate(padded_page2s, 0)
437
+
438
+ return np.concatenate([padded_page2s_, page1s_], 1)
439
+
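+ # Sketch of _generate_page() above: for each batch element the generated words are packed
+ # into lines of about rwidth pixels (page1, with an extra row showing the special-alphabet
+ # archetypes) and the style samples in ST into lines of about lwidth pixels (page2); both
+ # pages are padded to a common height and returned side by side, style samples on the left
+ # and generated text on the right.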
440
+ def get_current_losses(self):
441
+
442
+ losses = {}
443
+
444
+ losses['G'] = self.loss_G
445
+ losses['D'] = self.loss_D
446
+ losses['Dfake'] = self.loss_Dfake
447
+ losses['Dreal'] = self.loss_Dreal
448
+ losses['OCR_fake'] = self.loss_OCR_fake
449
+ losses['OCR_real'] = self.loss_OCR_real
450
+ losses['w_fake'] = self.loss_w_fake
451
+ losses['w_real'] = self.loss_w_real
452
+ losses['cycle'] = self.Lcycle
453
+
454
+ return losses
455
+
456
+ def _set_input(self, input):
457
+ self.input = input
458
+
459
+ self.real = self.input['img'].to(self.args.device)
460
+ self.label = self.input['label']
461
+
462
+ self.set_ocr_data(self.input['img'], self.input['label'])
463
+
464
+ self.sdata = self.input['simg'].to(self.args.device)
465
+ self.slabels = self.input['slabels']
466
+
467
+ self.ST_LEN = self.input['swids']
468
+
469
+ def set_requires_grad(self, nets, requires_grad=False):
470
+ """Set requies_grad=Fasle for all the networks to avoid unnecessary computations
471
+ Parameters:
472
+ nets (network list) -- a list of networks
473
+ requires_grad (bool) -- whether the networks require gradients or not
474
+ """
475
+ if not isinstance(nets, list):
476
+ nets = [nets]
477
+ for net in nets:
478
+ if net is not None:
479
+ for param in net.parameters():
480
+ param.requires_grad = requires_grad
481
+
482
+ def forward(self):
483
+ self.text_encode, self.len_text, self.encode_pos = self.netconverter.encode(self.label)
484
+ self.text_encode = self.text_encode.to(self.args.device).detach()
485
+ self.len_text = self.len_text.detach()
486
+
487
+ self.words = [self.word_generator.generate().encode('utf-8') for _ in range(self.args.batch_size)]
488
+ self.text_encode_fake, self.len_text_fake, self.encode_pos_fake = self.netconverter.encode(self.words)
489
+ self.text_encode_fake = self.text_encode_fake.to(self.args.device)
490
+ self.one_hot_fake = make_one_hot(self.text_encode_fake, self.len_text_fake, self.args.vocab_size).to(
491
+ self.args.device)
492
+
493
+ self.fake, self.style = self.netG(self.sdata, self.text_encode_fake)
494
+
495
+ self.update_last_fakes(self.fake)
496
+
497
+ def pad_width(self, t, new_width):
498
+ result = torch.ones((t.size(0), t.size(1), t.size(2), new_width), device=t.device)
499
+ result[:,:,:,:t.size(-1)] = t
500
+
501
+ return result
502
+
503
+ def compute_real_ocr_loss(self, ocr_network = None):
504
+ network = ocr_network if ocr_network is not None else self.netOCR
505
+ real_input = self.ocr_images
506
+ input_images = real_input
507
+ input_labels = self.ocr_labels
508
+
509
+ input_images = input_images.detach()
510
+
511
+ if self.ocr_augmenter is not None:
512
+ input_images = self.ocr_augmenter(input_images)
513
+
514
+ pred_real = network(input_images)
515
+ preds_size = torch.IntTensor([pred_real.size(0)] * len(input_labels)).detach()
516
+ text_encode, len_text, _ = self.netconverter.encode(input_labels)
517
+
518
+ loss = self.OCR_criterion(pred_real, text_encode.detach(), preds_size, len_text.detach())
519
+
520
+ return torch.mean(loss[~torch.isnan(loss)])
521
+
522
+ def compute_fake_ocr_loss(self, ocr_network = None):
523
+ network = ocr_network if ocr_network is not None else self.netOCR
524
+
525
+ pred_fake_OCR = network(self.fake)
526
+ preds_size = torch.IntTensor([pred_fake_OCR.size(0)] * self.args.batch_size).detach()
527
+ loss_OCR_fake = self.OCR_criterion(pred_fake_OCR, self.text_encode_fake.detach(), preds_size,
528
+ self.len_text_fake.detach())
529
+ return torch.mean(loss_OCR_fake[~torch.isnan(loss_OCR_fake)])
530
+
531
+ def set_ocr_data(self, images, labels):
532
+ self.ocr_images = images.to(self.args.device)
533
+ self.ocr_labels = labels
534
+
535
+ def backward_D_OCR(self):
536
+ self.real.__repr__()
537
+ self.fake.__repr__()
538
+ pred_real = self.netD(self.real.detach())
539
+ pred_fake = self.netD(**{'x': self.fake.detach()})
540
+
541
+ self.update_acc(pred_real, pred_fake)
542
+
543
+ self.loss_Dreal, self.loss_Dfake = loss_hinge_dis(pred_fake, pred_real, self.len_text_fake.detach(),
544
+ self.len_text.detach(), True)
545
+
546
+ self.loss_D = self.loss_Dreal + self.loss_Dfake
547
+
548
+ if not self.args.no_ocr_loss:
549
+ self.loss_OCR_real = self.compute_real_ocr_loss()
550
+ loss_total = self.loss_D + self.loss_OCR_real
551
+ else:
552
+ loss_total = self.loss_D
553
+
554
+ # backward
555
+ loss_total.backward()
556
+ if not self.args.no_ocr_loss:
557
+ self.clean_grad(self.netOCR.parameters())
558
+
559
+ return loss_total
560
+
561
+ def clean_grad(self, params):
562
+ for param in params:
563
+ param.grad[param.grad != param.grad] = 0
564
+ param.grad[torch.isnan(param.grad)] = 0
565
+ param.grad[torch.isinf(param.grad)] = 0
566
+
567
+ def backward_D_WL(self):
568
+ # Real
569
+ pred_real = self.netD(self.real.detach())
570
+
571
+ pred_fake = self.netD(**{'x': self.fake.detach()})
572
+
573
+ self.update_acc(pred_real, pred_fake)
574
+
575
+ self.loss_Dreal, self.loss_Dfake = loss_hinge_dis(pred_fake, pred_real, self.len_text_fake.detach(),
576
+ self.len_text.detach(), True)
577
+
578
+ self.loss_D = self.loss_Dreal + self.loss_Dfake
579
+
580
+ if not self.args.no_writer_loss:
581
+ self.loss_w_real = self.netW(self.real.detach(), self.input['wcl'].to(self.args.device)).mean()
582
+ # total loss
583
+ loss_total = self.loss_D + self.loss_w_real * self.args.writer_loss_weight
584
+ else:
585
+ loss_total = self.loss_D
586
+
587
+ # backward
588
+ loss_total.backward()
589
+
590
+ return loss_total
591
+
592
+ def optimize_D_WL(self):
593
+ self.forward()
594
+ self.set_requires_grad([self.netD], True)
595
+ self.set_requires_grad([self.netOCR], False)
596
+ self.set_requires_grad([self.netW], True)
597
+ self.set_requires_grad([self.netW], True)
598
+
599
+ self.optimizer_D.zero_grad()
600
+ self.optimizer_wl.zero_grad()
601
+
602
+ self.backward_D_WL()
603
+
604
+ def optimize_D_WL_step(self):
605
+ self.optimizer_D.step()
606
+ self.optimizer_wl.step()
607
+ self.optimizer_D.zero_grad()
608
+ self.optimizer_wl.zero_grad()
609
+
610
+ def compute_cycle_loss(self):
611
+ fake_input = torch.ones_like(self.sdata)
612
+ width = min(self.sdata.size(-1), self.fake.size(-1))
613
+ fake_input[:, :, :, :width] = self.fake.repeat(1, 15, 1, 1)[:, :, :, :width]  # tile the single-channel fake to fill the 15 style-sample slots of sdata
614
+ with torch.no_grad():
615
+ fake_style = self.netG.compute_style(fake_input)
616
+
617
+ return torch.sum(torch.abs(self.style.detach() - fake_style), dim=1).mean()
618
+
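+ # Sketch of compute_cycle_loss() above: the generated image is tiled into the layout of a
+ # style-sample batch, re-encoded with netG.compute_style under no_grad, and the loss is the
+ # mean absolute difference between the original (detached) style vector and the re-encoded
+ # one.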
619
+ def backward_G_only(self):
620
+ self.gb_alpha = 0.7
621
+ if self.args.is_cycle:
622
+ self.Lcycle = self.compute_cycle_loss()
623
+
624
+ self.loss_G = loss_hinge_gen(self.netD(**{'x': self.fake}), self.len_text_fake.detach(), True).mean()
625
+
626
+ compute_ocr = not self.args.no_ocr_loss
627
+
628
+ if compute_ocr:
629
+ self.loss_OCR_fake = self.compute_fake_ocr_loss()
630
+
631
+ self.loss_G = self.loss_G + self.Lcycle
632
+
633
+ if compute_ocr:
634
+ self.loss_T = self.loss_G + self.loss_OCR_fake
635
+ else:
636
+ self.loss_T = self.loss_G
637
+
638
+ if compute_ocr:
639
+ grad_fake_OCR = torch.autograd.grad(self.loss_OCR_fake, self.fake, retain_graph=True)[0]
640
+ self.loss_grad_fake_OCR = 10 ** 6 * torch.mean(grad_fake_OCR ** 2)
641
+
642
+ grad_fake_adv = torch.autograd.grad(self.loss_G, self.fake, retain_graph=True)[0]
643
+ self.loss_grad_fake_adv = 10 ** 6 * torch.mean(grad_fake_adv ** 2)
644
+
645
+ self.loss_T.backward(retain_graph=True)
646
+
647
+ if compute_ocr:
648
+ grad_fake_OCR = torch.autograd.grad(self.loss_OCR_fake, self.fake, create_graph=True, retain_graph=True)[0]
649
+ grad_fake_adv = torch.autograd.grad(self.loss_G, self.fake, create_graph=True, retain_graph=True)[0]
650
+ a = self.gb_alpha * torch.div(torch.std(grad_fake_adv), self.epsilon + torch.std(grad_fake_OCR))
651
+ self.loss_OCR_fake = a.detach() * self.loss_OCR_fake
652
+ self.loss_T = self.loss_G + self.loss_OCR_fake
653
+ else:
654
+ grad_fake_adv = torch.autograd.grad(self.loss_G, self.fake, create_graph=True, retain_graph=True)[0]
655
+ a = 1
656
+ self.loss_T = self.loss_G
657
+
658
+ if a is None:
659
+ print(self.loss_OCR_fake, self.loss_G, torch.std(grad_fake_adv))
660
+ if a > 1000 or a < 0.0001:
661
+ print(f'WARNING: alpha > 1000 or alpha < 0.0001 - alpha={a.item()}')
662
+
663
+ self.loss_T.backward(retain_graph=True)
664
+ if compute_ocr:
665
+ grad_fake_OCR = torch.autograd.grad(self.loss_OCR_fake, self.fake, create_graph=False, retain_graph=True)[0]
666
+ self.loss_grad_fake_OCR = 10 ** 6 * torch.mean(grad_fake_OCR ** 2)
667
+ grad_fake_adv = torch.autograd.grad(self.loss_G, self.fake, create_graph=False, retain_graph=True)[0]
668
+ self.loss_grad_fake_adv = 10 ** 6 * torch.mean(grad_fake_adv ** 2)
669
+
670
+ with torch.no_grad():
671
+ self.loss_T.backward()
672
+ if compute_ocr:
673
+ if any(torch.isnan(torch.unsqueeze(self.loss_OCR_fake, dim=0))) or torch.isnan(self.loss_G):
674
+ print('loss OCR fake: ', self.loss_OCR_fake, ' loss_G: ', self.loss_G, ' words: ', self.words)
675
+ sys.exit()
676
+
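+ # Gradient-balancing sketch for backward_G_only() above: the OCR term is rescaled with
+ #     alpha = gb_alpha * std(grad_adv) / (epsilon + std(grad_OCR)),
+ # where the gradients are taken w.r.t. the fake image, so that the recognizer loss
+ # contributes with a spread comparable to the adversarial loss; the generator objective is
+ # then loss_G (+ cycle term) + alpha * loss_OCR_fake.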
677
+ def backward_G_WL(self):
678
+ self.gb_alpha = 0.7
679
+ if self.args.is_cycle:
680
+ self.Lcycle = self.compute_cycle_loss()
681
+
682
+ self.loss_G = loss_hinge_gen(self.netD(**{'x': self.fake}), self.len_text_fake.detach(), True).mean()
683
+
684
+ if not self.args.no_writer_loss:
685
+ self.loss_w_fake = self.netW(self.fake, self.input['wcl'].to(self.args.device)).mean()
686
+
687
+ self.loss_G = self.loss_G + self.Lcycle
688
+
689
+ if not self.args.no_writer_loss:
690
+ self.loss_T = self.loss_G + self.loss_w_fake * self.args.writer_loss_weight
691
+ else:
692
+ self.loss_T = self.loss_G
693
+
694
+ self.loss_T.backward(retain_graph=True)
695
+
696
+ if not self.args.no_writer_loss:
697
+ grad_fake_WL = torch.autograd.grad(self.loss_w_fake, self.fake, create_graph=True, retain_graph=True)[0]
698
+ grad_fake_adv = torch.autograd.grad(self.loss_G, self.fake, create_graph=True, retain_graph=True)[0]
699
+ a = self.gb_alpha * torch.div(torch.std(grad_fake_adv), self.epsilon + torch.std(grad_fake_WL))
700
+ self.loss_w_fake = a.detach() * self.loss_w_fake
701
+ self.loss_T = self.loss_G + self.loss_w_fake
702
+ else:
703
+ grad_fake_adv = torch.autograd.grad(self.loss_G, self.fake, create_graph=True, retain_graph=True)[0]
704
+ a = 1
705
+ self.loss_T = self.loss_G
706
+
707
+ if a is None:
708
+ print(self.loss_w_fake, self.loss_G, torch.std(grad_fake_adv))
709
+ if a > 1000 or a < 0.0001:
710
+ print(f'WARNING: alpha > 1000 or alpha < 0.0001 - alpha={a.item()}')
711
+
712
+ self.loss_T.backward(retain_graph=True)
713
+
714
+ if not self.args.no_writer_loss:
715
+ grad_fake_WL = torch.autograd.grad(self.loss_w_fake, self.fake, create_graph=False, retain_graph=True)[0]
716
+ self.loss_grad_fake_WL = 10 ** 6 * torch.mean(grad_fake_WL ** 2)
717
+ grad_fake_adv = torch.autograd.grad(self.loss_G, self.fake, create_graph=False, retain_graph=True)[0]
718
+ self.loss_grad_fake_adv = 10 ** 6 * torch.mean(grad_fake_adv ** 2)
719
+
720
+ with torch.no_grad():
721
+ self.loss_T.backward()
722
+
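+ # Note: backward_G() below references self.opt, self.z and self.wcl, which are not set in
+ # __init__; it appears to be an older code path, while backward_G_only() and
+ # backward_G_WL() above are the ones driven by optimize_G_only() / optimize_G_WL().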
723
+ def backward_G(self):
724
+ self.opt.gb_alpha = 0.7
725
+ self.loss_G = loss_hinge_gen(self.netD(**{'x': self.fake, 'z': self.z}), self.len_text_fake.detach(),
726
+ self.opt.mask_loss)
727
+ # OCR loss on real data
728
+ compute_ocr = not self.args.no_ocr_loss
729
+
730
+ if compute_ocr:
731
+ self.loss_OCR_fake = self.compute_fake_ocr_loss()
732
+ else:
733
+ self.loss_OCR_fake = 0.0
734
+
735
+ self.loss_w_fake = self.netW(self.fake, self.wcl)
736
+ # self.loss_OCR_fake = self.loss_OCR_fake + self.loss_w_fake
737
+ # total loss
738
+
739
+ # l1 = self.params[0]*self.loss_G
740
+ # l2 = self.params[0]*self.loss_OCR_fake
741
+ # l3 = self.params[0]*self.loss_w_fake
742
+ self.loss_G_ = 10 * self.loss_G + self.loss_w_fake
743
+ self.loss_T = self.loss_G_ + self.loss_OCR_fake
744
+
745
+ grad_fake_OCR = torch.autograd.grad(self.loss_OCR_fake, self.fake, retain_graph=True)[0]
746
+
747
+ self.loss_grad_fake_OCR = 10 ** 6 * torch.mean(grad_fake_OCR ** 2)
748
+ grad_fake_adv = torch.autograd.grad(self.loss_G_, self.fake, retain_graph=True)[0]
749
+ self.loss_grad_fake_adv = 10 ** 6 * torch.mean(grad_fake_adv ** 2)
750
+
751
+ if not False:
752
+
753
+ self.loss_T.backward(retain_graph=True)
754
+
755
+ grad_fake_OCR = torch.autograd.grad(self.loss_OCR_fake, self.fake, create_graph=True, retain_graph=True)[0]
756
+ grad_fake_adv = torch.autograd.grad(self.loss_G_, self.fake, create_graph=True, retain_graph=True)[0]
757
+ # grad_fake_wl = torch.autograd.grad(self.loss_w_fake, self.fake, create_graph=True, retain_graph=True)[0]
758
+
759
+ a = self.opt.gb_alpha * torch.div(torch.std(grad_fake_adv), self.epsilon + torch.std(grad_fake_OCR))
760
+
761
+ # a0 = self.opt.gb_alpha * torch.div(torch.std(grad_fake_adv), self.epsilon+torch.std(grad_fake_wl))
762
+
763
+ if a is None:
764
+ print(self.loss_OCR_fake, self.loss_G_, torch.std(grad_fake_adv), torch.std(grad_fake_OCR))
765
+ if a > 1000 or a < 0.0001:
766
+ print(f'WARNING: alpha > 1000 or alpha < 0.0001 - alpha={a.item()}')
767
+ b = self.opt.gb_alpha * (torch.mean(grad_fake_adv) -
768
+ torch.div(torch.std(grad_fake_adv), self.epsilon + torch.std(grad_fake_OCR)) *
769
+ torch.mean(grad_fake_OCR))
770
+ # self.loss_OCR_fake = a.detach() * self.loss_OCR_fake + b.detach() * torch.sum(self.fake)
771
+ self.loss_OCR_fake = a.detach() * self.loss_OCR_fake
772
+ # self.loss_w_fake = a0.detach() * self.loss_w_fake
773
+
774
+ self.loss_T = (1 - 1 * self.opt.onlyOCR) * self.loss_G_ + self.loss_OCR_fake # + self.loss_w_fake
775
+ self.loss_T.backward(retain_graph=True)
776
+ grad_fake_OCR = torch.autograd.grad(self.loss_OCR_fake, self.fake, create_graph=False, retain_graph=True)[0]
777
+ grad_fake_adv = torch.autograd.grad(self.loss_G_, self.fake, create_graph=False, retain_graph=True)[0]
778
+ self.loss_grad_fake_OCR = 10 ** 6 * torch.mean(grad_fake_OCR ** 2)
779
+ self.loss_grad_fake_adv = 10 ** 6 * torch.mean(grad_fake_adv ** 2)
780
+ with torch.no_grad():
781
+ self.loss_T.backward()
782
+ else:
783
+ self.loss_T.backward()
784
+
785
+ if self.opt.clip_grad > 0:
786
+ clip_grad_norm_(self.netG.parameters(), self.opt.clip_grad)
787
+ if any(torch.isnan(torch.unsqueeze(self.loss_OCR_fake, dim=0))) or torch.isnan(self.loss_G_):
788
+ print('loss OCR fake: ', self.loss_OCR_fake, ' loss_G: ', self.loss_G, ' words: ', self.words)
789
+ sys.exit()
790
+
791
+ def optimize_D_OCR(self):
792
+ self.forward()
793
+ self.set_requires_grad([self.netD], True)
794
+ self.set_requires_grad([self.netOCR], True)
795
+ self.optimizer_D.zero_grad()
796
+ # if self.opt.OCR_init in ['glorot', 'xavier', 'ortho', 'N02']:
797
+ self.optimizer_OCR.zero_grad()
798
+ self.backward_D_OCR()
799
+
800
+ def optimize_D_OCR_step(self):
801
+ self.optimizer_D.step()
802
+
803
+ self.optimizer_OCR.step()
804
+ self.optimizer_D.zero_grad()
805
+ self.optimizer_OCR.zero_grad()
806
+
807
+ def optimize_G_WL(self):
808
+ self.forward()
809
+ self.set_requires_grad([self.netD], False)
810
+ self.set_requires_grad([self.netOCR], False)
811
+ self.set_requires_grad([self.netW], False)
812
+ self.backward_G_WL()
813
+
814
+ def optimize_G_only(self):
815
+ self.forward()
816
+ self.set_requires_grad([self.netD], False)
817
+ self.set_requires_grad([self.netOCR], False)
818
+ self.set_requires_grad([self.netW], False)
819
+ self.backward_G_only()
820
+
821
+ def optimize_G_step(self):
822
+ self.optimizer_G.step()
823
+ self.optimizer_G.zero_grad()
824
+
825
+ def save_networks(self, epoch, save_dir):
826
+ """Save all the networks to the disk.
827
+
828
+ Parameters:
829
+ epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name)
830
+ """
831
+ for name in self.model_names:
832
+ if isinstance(name, str):
833
+ save_filename = '%s_net_%s.pth' % (epoch, name)
834
+ save_path = os.path.join(save_dir, save_filename)
835
+ net = getattr(self, 'net' + name)
836
+
837
+ if len(self.gpu_ids) > 0 and torch.cuda.is_available():
838
+ # torch.save(net.module.cpu().state_dict(), save_path)
839
+ if len(self.gpu_ids) > 1:
840
+ torch.save(net.module.cpu().state_dict(), save_path)
841
+ else:
842
+ torch.save(net.cpu().state_dict(), save_path)
843
+ net.cuda(self.gpu_ids[0])
844
+ else:
845
+ torch.save(net.cpu().state_dict(), save_path)
846
+
847
+ def compute_d_scores(self, data_loader: torch.utils.data.DataLoader, amount: int = None):
848
+ scores = []
849
+ words = []
850
+ amount = len(data_loader) if amount is None else amount // data_loader.batch_size
851
+
852
+ with torch.no_grad():
853
+ for i in range(amount):
854
+ data = next(iter(data_loader))
855
+ words.extend([d.decode() for d in data['label']])
856
+ scores.extend(list(self.netD(data['img'].to(self.args.device)).squeeze().detach().cpu().numpy()))
857
+
858
+ return scores, words
859
+
860
+ def compute_d_scores_fake(self, data_loader: torch.utils.data.DataLoader, amount: int = None):
861
+ scores = []
862
+ words = []
863
+ amount = len(data_loader) if amount is None else amount // data_loader.batch_size
864
+
865
+ with torch.no_grad():
866
+ for i in range(amount):
867
+ data = next(iter(data_loader))
868
+ to_generate = [self.word_generator.generate().encode('utf-8') for _ in range(data_loader.batch_size)]
869
+ text_encode_fake, len_text_fake, encode_pos_fake = self.netconverter.encode(to_generate)
870
+ fake, _ = self.netG(data['simg'].to(self.args.device), text_encode_fake.to(self.args.device))
871
+
872
+ words.extend([d.decode() for d in to_generate])
873
+ scores.extend(list(self.netD(fake).squeeze().detach().cpu().numpy()))
874
+
875
+ return scores, words
876
+
877
+ def compute_d_stats(self, train_loader: torch.utils.data.DataLoader, val_loader: torch.utils.data.DataLoader):
878
+ train_values = []
879
+ val_values = []
880
+ fake_values = []
881
+ with torch.no_grad():
882
+ for i in range(self.rv_sample_size // train_loader.batch_size):
883
+ data = next(iter(train_loader))
884
+ train_values.append(self.netD(data['img'].to(self.args.device)).squeeze().detach().cpu().numpy())
885
+
886
+ for i in range(self.rv_sample_size // val_loader.batch_size):
887
+ data = next(iter(val_loader))
888
+ val_values.append(self.netD(data['img'].to(self.args.device)).squeeze().detach().cpu().numpy())
889
+
890
+ for i in range(self.rv_sample_size):
891
+ data = self.last_fakes[i]
892
+ fake_values.append(self.netD(data.unsqueeze(0)).squeeze().detach().cpu().numpy())
893
+
894
+ return np.mean(train_values), np.mean(val_values), np.mean(fake_values)
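+ # Note on the helpers above: compute_d_scores / compute_d_scores_fake collect raw
+ # discriminator scores (with the corresponding words) for real and generated samples, and
+ # compute_d_stats returns the mean discriminator score on training images, validation
+ # images and the most recent fakes, e.g. to monitor discriminator over-fitting.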
models/networks.py ADDED
@@ -0,0 +1,98 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.nn import init
4
+ import functools
5
+ from torch.optim import lr_scheduler
6
+ from util.util import to_device, load_network
7
+
8
+ ###############################################################################
9
+ # Helper Functions
10
+ ###############################################################################
11
+
12
+
13
+ def init_weights(net, init_type='normal', init_gain=0.02):
14
+ """Initialize network weights.
15
+
16
+ Parameters:
17
+ net (network) -- network to be initialized
18
+ init_type (str) -- the name of an initialization method: normal | xavier | kaiming | orthogonal
19
+ init_gain (float) -- scaling factor for normal, xavier and orthogonal.
20
+
21
+ We use 'normal' in the original pix2pix and CycleGAN paper. But xavier and kaiming might
22
+ work better for some applications. Feel free to try yourself.
23
+ """
24
+ def init_func(m): # define the initialization function
25
+ classname = m.__class__.__name__
26
+ if (isinstance(m, nn.Conv2d)
27
+ or isinstance(m, nn.Linear)
28
+ or isinstance(m, nn.Embedding)):
29
+ # if hasattr(m, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1):
30
+ if init_type == 'N02':
31
+ init.normal_(m.weight.data, 0.0, init_gain)
32
+ elif init_type in ['glorot', 'xavier']:
33
+ init.xavier_normal_(m.weight.data, gain=init_gain)
34
+ elif init_type == 'kaiming':
35
+ init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
36
+ elif init_type == 'ortho':
37
+ init.orthogonal_(m.weight.data, gain=init_gain)
38
+ else:
39
+ raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
40
+ # if hasattr(m, 'bias') and m.bias is not None:
41
+ # init.constant_(m.bias.data, 0.0)
42
+ # elif classname.find('BatchNorm2d') != -1: # BatchNorm Layer's weight is not a matrix; only normal distribution applies.
43
+ # init.normal_(m.weight.data, 1.0, init_gain)
44
+ # init.constant_(m.bias.data, 0.0)
45
+ if init_type in ['N02', 'glorot', 'xavier', 'kaiming', 'ortho']:
46
+ # print('initialize network with %s' % init_type)
47
+ net.apply(init_func) # apply the initialization function <init_func>
48
+ else:
49
+ # print('loading the model from %s' % init_type)
50
+ net = load_network(net, init_type, 'latest')
51
+ return net
52
+
53
+ def init_net(net, init_type='normal', init_gain=0.02, gpu_ids=[]):
54
+ """Initialize a network: 1. register CPU/GPU device (with multi-GPU support); 2. initialize the network weights
55
+ Parameters:
56
+ net (network) -- the network to be initialized
57
+ init_type (str) -- the name of an initialization method: normal | xavier | kaiming | orthogonal
58
+ gain (float) -- scaling factor for normal, xavier and orthogonal.
59
+ gpu_ids (int list) -- which GPUs the network runs on: e.g., 0,1,2
60
+
61
+ Return an initialized network.
62
+ """
63
+ if len(gpu_ids) > 0:
64
+ assert(torch.cuda.is_available())
65
+ net.to(gpu_ids[0])
66
+ net = torch.nn.DataParallel(net, gpu_ids) # multi-GPUs
67
+ init_weights(net, init_type, init_gain=init_gain)
68
+ return net
69
+
70
+
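+ # Illustrative usage (a sketch; `MyGenerator` and the argument values are assumptions, not
+ # part of this repository):
+ #     net = init_net(MyGenerator(), init_type='N02', init_gain=0.02, gpu_ids=[0])
+ #     # or, to only re-initialize weights in place:
+ #     init_weights(net, init_type='xavier', init_gain=0.02)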
71
+ def get_scheduler(optimizer, opt):
72
+ """Return a learning rate scheduler
73
+
74
+ Parameters:
75
+ optimizer -- the optimizer of the network
76
+ opt (option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions.
77
+ opt.lr_policy is the name of learning rate policy: linear | step | plateau | cosine
78
+
79
+ For 'linear', we keep the same learning rate for the first <opt.niter> epochs
80
+ and linearly decay the rate to zero over the next <opt.niter_decay> epochs.
81
+ For other schedulers (step, plateau, and cosine), we use the default PyTorch schedulers.
82
+ See https://pytorch.org/docs/stable/optim.html for more details.
83
+ """
84
+ if opt.lr_policy == 'linear':
85
+ def lambda_rule(epoch):
86
+ lr_l = 1.0 - max(0, epoch + opt.epoch_count - opt.niter) / float(opt.niter_decay + 1)
87
+ return lr_l
88
+ scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
89
+ elif opt.lr_policy == 'step':
90
+ scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.1)
91
+ elif opt.lr_policy == 'plateau':
92
+ scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, threshold=0.01, patience=5)
93
+ elif opt.lr_policy == 'cosine':
94
+ scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.niter, eta_min=0)
95
+ else:
96
+ raise NotImplementedError('learning rate policy [%s] is not implemented' % opt.lr_policy)
97
+ return scheduler
98
+
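+ # Illustrative usage (a sketch; the `opt` fields are assumptions inferred from the branches
+ # above):
+ #     opt.lr_policy, opt.niter, opt.niter_decay, opt.epoch_count = 'linear', 100, 100, 1
+ #     scheduler = get_scheduler(optimizer, opt)
+ #     for epoch in range(opt.niter + opt.niter_decay):
+ #         ...  # train for one epoch
+ #         scheduler.step()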
models/positional_encodings.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+
6
+ def get_emb(sin_inp):
7
+ """
8
+ Gets a base embedding for one dimension with sin and cos intertwined
9
+ """
10
+ emb = torch.stack((sin_inp.sin(), sin_inp.cos()), dim=-1)
11
+ return torch.flatten(emb, -2, -1)
12
+
13
+
14
+ class PositionalEncoding1D(nn.Module):
15
+ def __init__(self, channels):
16
+ """
17
+ :param channels: The last dimension of the tensor you want to apply pos emb to.
18
+ """
19
+ super(PositionalEncoding1D, self).__init__()
20
+ self.org_channels = channels
21
+ channels = int(np.ceil(channels / 2) * 2)
22
+ self.channels = channels
23
+ inv_freq = 1.0 / (10000 ** (torch.arange(0, channels, 2).float() / channels))
24
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
25
+ self.cached_penc = None
26
+
27
+ def forward(self, tensor):
28
+ """
29
+ :param tensor: A 3d tensor of size (batch_size, x, ch)
30
+ :return: Positional Encoding Matrix of size (batch_size, x, ch)
31
+ """
32
+ if len(tensor.shape) != 3:
33
+ raise RuntimeError("The input tensor has to be 3d!")
34
+
35
+ if self.cached_penc is not None and self.cached_penc.shape == tensor.shape:
36
+ return self.cached_penc
37
+
38
+ self.cached_penc = None
39
+ batch_size, x, orig_ch = tensor.shape
40
+ pos_x = torch.arange(x, device=tensor.device).type(self.inv_freq.type())
41
+ sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq)
42
+ emb_x = get_emb(sin_inp_x)
43
+ emb = torch.zeros((x, self.channels), device=tensor.device).type(tensor.type())
44
+ emb[:, : self.channels] = emb_x
45
+
46
+ self.cached_penc = emb[None, :, :orig_ch].repeat(batch_size, 1, 1)
47
+ return self.cached_penc
48
+
49
+
50
+ class PositionalEncodingPermute1D(nn.Module):
51
+ def __init__(self, channels):
52
+ """
53
+ Accepts (batchsize, ch, x) instead of (batchsize, x, ch)
54
+ """
55
+ super(PositionalEncodingPermute1D, self).__init__()
56
+ self.penc = PositionalEncoding1D(channels)
57
+
58
+ def forward(self, tensor):
59
+ tensor = tensor.permute(0, 2, 1)
60
+ enc = self.penc(tensor)
61
+ return enc.permute(0, 2, 1)
62
+
63
+ @property
64
+ def org_channels(self):
65
+ return self.penc.org_channels
66
+
67
+
68
+ class PositionalEncoding2D(nn.Module):
69
+ def __init__(self, channels):
70
+ """
71
+ :param channels: The last dimension of the tensor you want to apply pos emb to.
72
+ """
73
+ super(PositionalEncoding2D, self).__init__()
74
+ self.org_channels = channels
75
+ channels = int(np.ceil(channels / 4) * 2)
76
+ self.channels = channels
77
+ inv_freq = 1.0 / (10000 ** (torch.arange(0, channels, 2).float() / channels))
78
+ self.register_buffer("inv_freq", inv_freq)
79
+ self.cached_penc = None
80
+
81
+ def forward(self, tensor):
82
+ """
83
+ :param tensor: A 4d tensor of size (batch_size, x, y, ch)
84
+ :return: Positional Encoding Matrix of size (batch_size, x, y, ch)
85
+ """
86
+ if len(tensor.shape) != 4:
87
+ raise RuntimeError("The input tensor has to be 4d!")
88
+
89
+ if self.cached_penc is not None and self.cached_penc.shape == tensor.shape:
90
+ return self.cached_penc
91
+
92
+ self.cached_penc = None
93
+ batch_size, x, y, orig_ch = tensor.shape
94
+ pos_x = torch.arange(x, device=tensor.device).type(self.inv_freq.type())
95
+ pos_y = torch.arange(y, device=tensor.device).type(self.inv_freq.type())
96
+ sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq)
97
+ sin_inp_y = torch.einsum("i,j->ij", pos_y, self.inv_freq)
98
+ emb_x = get_emb(sin_inp_x).unsqueeze(1)
99
+ emb_y = get_emb(sin_inp_y)
100
+ emb = torch.zeros((x, y, self.channels * 2), device=tensor.device).type(
101
+ tensor.type()
102
+ )
103
+ emb[:, :, : self.channels] = emb_x
104
+ emb[:, :, self.channels : 2 * self.channels] = emb_y
105
+
106
+ self.cached_penc = emb[None, :, :, :orig_ch].repeat(tensor.shape[0], 1, 1, 1)
107
+ return self.cached_penc
108
+
109
+
110
+ class PositionalEncodingPermute2D(nn.Module):
111
+ def __init__(self, channels):
112
+ """
113
+ Accepts (batchsize, ch, x, y) instead of (batchsize, x, y, ch)
114
+ """
115
+ super(PositionalEncodingPermute2D, self).__init__()
116
+ self.penc = PositionalEncoding2D(channels)
117
+
118
+ def forward(self, tensor):
119
+ tensor = tensor.permute(0, 2, 3, 1)
120
+ enc = self.penc(tensor)
121
+ return enc.permute(0, 3, 1, 2)
122
+
123
+ @property
124
+ def org_channels(self):
125
+ return self.penc.org_channels
126
+
127
+
128
+ class PositionalEncoding3D(nn.Module):
129
+ def __init__(self, channels):
130
+ """
131
+ :param channels: The last dimension of the tensor you want to apply pos emb to.
132
+ """
133
+ super(PositionalEncoding3D, self).__init__()
134
+ self.org_channels = channels
135
+ channels = int(np.ceil(channels / 6) * 2)
136
+ if channels % 2:
137
+ channels += 1
138
+ self.channels = channels
139
+ inv_freq = 1.0 / (10000 ** (torch.arange(0, channels, 2).float() / channels))
140
+ self.register_buffer("inv_freq", inv_freq)
141
+ self.cached_penc = None
142
+
143
+ def forward(self, tensor):
144
+ """
145
+ :param tensor: A 5d tensor of size (batch_size, x, y, z, ch)
146
+ :return: Positional Encoding Matrix of size (batch_size, x, y, z, ch)
147
+ """
148
+ if len(tensor.shape) != 5:
149
+ raise RuntimeError("The input tensor has to be 5d!")
150
+
151
+ if self.cached_penc is not None and self.cached_penc.shape == tensor.shape:
152
+ return self.cached_penc
153
+
154
+ self.cached_penc = None
155
+ batch_size, x, y, z, orig_ch = tensor.shape
156
+ pos_x = torch.arange(x, device=tensor.device).type(self.inv_freq.type())
157
+ pos_y = torch.arange(y, device=tensor.device).type(self.inv_freq.type())
158
+ pos_z = torch.arange(z, device=tensor.device).type(self.inv_freq.type())
159
+ sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq)
160
+ sin_inp_y = torch.einsum("i,j->ij", pos_y, self.inv_freq)
161
+ sin_inp_z = torch.einsum("i,j->ij", pos_z, self.inv_freq)
162
+ emb_x = get_emb(sin_inp_x).unsqueeze(1).unsqueeze(1)
163
+ emb_y = get_emb(sin_inp_y).unsqueeze(1)
164
+ emb_z = get_emb(sin_inp_z)
165
+ emb = torch.zeros((x, y, z, self.channels * 3), device=tensor.device).type(
166
+ tensor.type()
167
+ )
168
+ emb[:, :, :, : self.channels] = emb_x
169
+ emb[:, :, :, self.channels : 2 * self.channels] = emb_y
170
+ emb[:, :, :, 2 * self.channels :] = emb_z
171
+
172
+ self.cached_penc = emb[None, :, :, :, :orig_ch].repeat(batch_size, 1, 1, 1, 1)
173
+ return self.cached_penc
174
+
175
+
176
+ class PositionalEncodingPermute3D(nn.Module):
177
+ def __init__(self, channels):
178
+ """
179
+ Accepts (batchsize, ch, x, y, z) instead of (batchsize, x, y, z, ch)
180
+ """
181
+ super(PositionalEncodingPermute3D, self).__init__()
182
+ self.penc = PositionalEncoding3D(channels)
183
+
184
+ def forward(self, tensor):
185
+ tensor = tensor.permute(0, 2, 3, 4, 1)
186
+ enc = self.penc(tensor)
187
+ return enc.permute(0, 4, 1, 2, 3)
188
+
189
+ @property
190
+ def org_channels(self):
191
+ return self.penc.org_channels
192
+
193
+
194
+ class Summer(nn.Module):
195
+ def __init__(self, penc):
196
+ """
197
+ :param model: The type of positional encoding to run the summer on.
198
+ """
199
+ super(Summer, self).__init__()
200
+ self.penc = penc
201
+
202
+ def forward(self, tensor):
203
+ """
204
+ :param tensor: A 3, 4 or 5d tensor that matches the model output size
205
+ :return: Positional Encoding Matrix summed to the original tensor
206
+ """
207
+ penc = self.penc(tensor)
208
+ assert (
209
+ tensor.size() == penc.size()
210
+ ), "The original tensor size {} and the positional encoding tensor size {} must match!".format(
211
+ tensor.size(), penc.size()
212
+ )
213
+ return tensor + penc
214
+
215
+
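+ # Illustrative usage (a sketch): Summer wraps any of the encodings above and adds the
+ # encoding to its input instead of returning it separately.
+ #     add_pe = Summer(PositionalEncoding2D(64))
+ #     y = add_pe(torch.zeros(1, 32, 32, 64))  # same shape as the input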
216
+ class SparsePositionalEncoding2D(PositionalEncoding2D):
217
+ def __init__(self, channels, x, y, device='cuda'):
218
+ super(SparsePositionalEncoding2D, self).__init__(channels)
219
+ self.y, self.x = y, x
220
+ self.fake_tensor = torch.zeros((1, x, y, channels), device=device)
221
+
222
+ def forward(self, coords):
223
+ """
224
+ :param coords: A list of list of coordinates (((x1, y1), (x2, y22), ... ), ... )
225
+ :return: Positional Encoding Matrix summed to the original tensor
226
+ """
227
+ encodings = super().forward(self.fake_tensor)
228
+ encodings = encodings.permute(0, 3, 1, 2)
229
+ indices = torch.nn.utils.rnn.pad_sequence([torch.LongTensor(c) for c in coords], batch_first=True, padding_value=-1)
230
+ indices = indices.unsqueeze(0).to(self.fake_tensor.device)
231
+ assert self.x == self.y
232
+ indices = (indices + 0.5) / self.x * 2 - 1
233
+ indices = torch.flip(indices, (-1, ))
234
+ return torch.nn.functional.grid_sample(encodings, indices).squeeze().permute(2, 1, 0)
235
+
236
+ # all_encodings = []
237
+ # for coords_row in coords:
238
+ # res_encodings = []
239
+ # for xy in coords_row:
240
+ # if xy is None:
241
+ # res_encodings.append(padding)
242
+ # else:
243
+ # x, y = xy
244
+ # res_encodings.append(encodings[x, y, :])
245
+ # all_encodings.append(res_encodings)
246
+ # return torch.stack(res_encodings).to(self.fake_tensor.device)
247
+
248
+ # coords = torch.Tensor(coords).to(self.fake_tensor.device).long()
249
+ # assert torch.all(coords[:, 0] < self.x)
250
+ # assert torch.all(coords[:, 1] < self.y)
251
+ # coords = coords[:, 0] + (coords[:, 1] * self.x)
252
+ # encodings = super().forward(self.fake_tensor).reshape((-1, self.org_channels))
253
+ # return encodings[coords]
254
+
255
+ if __name__ == '__main__':
256
+ pos = SparsePositionalEncoding2D(10, 20, 20, device='cpu')  # square grid (forward() asserts x == y); CPU so the smoke test runs without CUDA
257
+ pos([[[0, 0], [0, 9]], [[1, 0], [9, 15]]])  # two coordinate lists, matching the nesting described in forward()'s docstring